From 83fed4c521680f432317c6c82282d3cb8d7110c2 Mon Sep 17 00:00:00 2001 From: "Joy, Albin" Date: Wed, 10 Sep 2025 15:49:09 +0000 Subject: [PATCH 1/4] Fix all the compilation warnings during build. --- benchmarks/gemm/benchmark_runner.hpp | 12 +++++++----- .../02_bmg_gemm_f16_u4_f16.cpp | 8 ++++---- .../cutlass/gemm/collective/xe_mma_mixed_input.hpp | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/benchmarks/gemm/benchmark_runner.hpp b/benchmarks/gemm/benchmark_runner.hpp index c7ea70cafa..73e789c8e0 100644 --- a/benchmarks/gemm/benchmark_runner.hpp +++ b/benchmarks/gemm/benchmark_runner.hpp @@ -175,12 +175,12 @@ struct BenchmarkRunnerGemm { using CollectiveMainloop = typename Gemm::GemmKernel::CollectiveMainloop; using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; - using ElementMma = CollectiveMainloop::TiledMma::ValTypeA; + using ElementMma = typename CollectiveMainloop::TiledMma::ValTypeA; - using ElementScale = ScaleType::type; - using ElementZero = ZeroType::type; - using StrideS = ScaleStride::type; - using StrideZ = ZeroStride::type; + using ElementScale = typename ScaleType::type; + using ElementZero = typename ZeroType::type; + using StrideS = typename ScaleStride::type; + using StrideZ = typename ZeroStride::type; using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; using ElementC = typename Gemm::ElementC; @@ -459,6 +459,7 @@ struct BenchmarkRunnerGemm { TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); auto [ptr_A, ptr_B] = [&]() { + auto [M, N, K, L] = problem_size; if constexpr (!is_mixed_dtype) { return make_tuple(block_A[0].get(), block_B[0].get()); } else { @@ -474,6 +475,7 @@ struct BenchmarkRunnerGemm { auto shape_scale = cute::make_shape(dq_mn_size, K / 128, L); static constexpr auto k_packed = CollectiveMainloop::zero_elements_packed_along_k; auto shape_zero = [&]() { + auto [M, N, K, L] = problem_size; if constexpr (is_tuple_v(stride_Z))>>) { return cute::make_shape(dq_mn_size, cute::make_shape(k_packed, cute::max(1, K / 128 / k_packed)), L); diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp index 5aa90672ae..1d47dba7f3 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp @@ -461,8 +461,8 @@ struct ExampleRunner { auto shape_B = cute::make_shape(N, K, L); auto shape_CD = cute::make_shape(M, N, L); auto shape_scale = cute::make_shape(dq_mn_size, scale_k, L); - auto shape_zero = [&]() { - if constexpr (is_tuple_v(stride_Z))>>) { + auto shape_zero = [&, stride_Z_ref = std::ref(stride_Z)]() { + if constexpr (is_tuple_v(stride_Z_ref.get()))>>) { return cute::make_shape(dq_mn_size, cute::make_shape(zero_elements_packed_along_k, cute::max(1, scale_k / zero_elements_packed_along_k)), L); } else { return shape_scale; @@ -474,8 +474,8 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, shape_CD); stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_CD); stride_S = cutlass::make_cute_packed_stride(StrideScale{}, shape_scale); - stride_Z = [&]() { - if constexpr (is_tuple_v(stride_Z))>>) { + stride_Z = [&, stride_Z_ref = std::ref(stride_Z)]() { + if constexpr (is_tuple_v(stride_Z_ref.get()))>>) { return make_stride(Int{}, make_stride(_1{}, int64_t(zero_elements_packed_along_k * dq_mn_size)), int64_t(dq_mn_size * scale_k)); } else { return stride_S; diff --git a/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp b/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp index 7901545b0b..2066d6747a 100644 --- a/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp +++ b/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp @@ -328,7 +328,7 @@ struct CollectiveMma< }(); if constexpr (ModeScale) { - return Params{tiled_copy_a, tiled_copy_b, tiled_copy_scale, {}, args.group_size}; + return Params{tiled_copy_a, tiled_copy_b, {tiled_copy_scale}, {}, args.group_size}; } else { auto ptr_Z = [&]() { if constexpr (sizeof_bits_v < 8) { @@ -353,7 +353,7 @@ struct CollectiveMma< } }(); - return Params{tiled_copy_a, tiled_copy_b, tiled_copy_scale, tiled_copy_zero, args.group_size}; + return Params{tiled_copy_a, tiled_copy_b, {tiled_copy_scale}, {tiled_copy_zero}, args.group_size}; } } } From f2b09dfe222037815e0d4e600427c63b77fecb25 Mon Sep 17 00:00:00 2001 From: Nitin Singh Date: Thu, 11 Sep 2025 12:28:14 +0000 Subject: [PATCH 2/4] Handle 3 compile time warnings 1. Tuple bindings passed by ref to lambda calls not natively handled by C++17 2. LOG_THREAD re-definition 3. Ignore unused build args --- CMakeLists.txt | 1 + .../02_bmg_gemm_f16_u4_f16.cpp | 15 ++++++++++----- test/unit/cute/intel_xe/mma.cpp | 1 + test/unit/cute/intel_xe/utils.hpp | 1 + 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7023721a2c..a631803f7c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,7 @@ if (CUTLASS_ENABLE_SYCL) DPCPP_SYCL_TARGET STREQUAL "intel_gpu_bmg_g21") set(SYCL_INTEL_TARGET ON) add_compile_definitions(SYCL_INTEL_TARGET) + add_compile_options(-Wno-unused-command-line-argument) endif() add_compile_definitions(CUTLASS_ENABLE_SYCL) diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp index 1d47dba7f3..0fb8be1470 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp @@ -452,7 +452,12 @@ struct ExampleRunner { /// Initialize operands to be used in the GEMM and reference GEMM void initialize(Options const& options) { - auto [M, N, K, L] = ProblemShapeType{options.m, options.n, options.k, options.l}; + // auto [M, N, K, L] = ProblemShapeType{options.m, options.n, options.k, options.l}; + auto problem_shape = ProblemShapeType{options.m, options.n, options.k, options.l}; + auto& M = cute::get<0>(problem_shape); + auto& N = cute::get<1>(problem_shape); + auto& K = cute::get<2>(problem_shape); + auto& L = cute::get<3>(problem_shape); auto zero_elements_packed_along_k = get<0>(StrideZero{}); const int scale_k = cute::ceil_div(options.k, options.g); @@ -461,8 +466,8 @@ struct ExampleRunner { auto shape_B = cute::make_shape(N, K, L); auto shape_CD = cute::make_shape(M, N, L); auto shape_scale = cute::make_shape(dq_mn_size, scale_k, L); - auto shape_zero = [&, stride_Z_ref = std::ref(stride_Z)]() { - if constexpr (is_tuple_v(stride_Z_ref.get()))>>) { + auto shape_zero = [&]() { + if constexpr (is_tuple_v(stride_Z))>>) { return cute::make_shape(dq_mn_size, cute::make_shape(zero_elements_packed_along_k, cute::max(1, scale_k / zero_elements_packed_along_k)), L); } else { return shape_scale; @@ -474,8 +479,8 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, shape_CD); stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_CD); stride_S = cutlass::make_cute_packed_stride(StrideScale{}, shape_scale); - stride_Z = [&, stride_Z_ref = std::ref(stride_Z)]() { - if constexpr (is_tuple_v(stride_Z_ref.get()))>>) { + stride_Z = [&]() { + if constexpr (is_tuple_v(stride_Z))>>) { return make_stride(Int{}, make_stride(_1{}, int64_t(zero_elements_packed_along_k * dq_mn_size)), int64_t(dq_mn_size * scale_k)); } else { return stride_S; diff --git a/test/unit/cute/intel_xe/mma.cpp b/test/unit/cute/intel_xe/mma.cpp index 1c0e3d8a61..8bbd4bb157 100755 --- a/test/unit/cute/intel_xe/mma.cpp +++ b/test/unit/cute/intel_xe/mma.cpp @@ -89,6 +89,7 @@ void gemm_device(TA const *A, TB const *B, TC *C, uint32_t m, uint32_t n, #define CUTLASS_ENABLE_DEBUG_PRINTS (0) +#undef LOG_THREAD #define LOG_THREAD (16) #if CUTLASS_ENABLE_DEBUG_PRINTS diff --git a/test/unit/cute/intel_xe/utils.hpp b/test/unit/cute/intel_xe/utils.hpp index e109d9fe27..d4597912a3 100755 --- a/test/unit/cute/intel_xe/utils.hpp +++ b/test/unit/cute/intel_xe/utils.hpp @@ -52,6 +52,7 @@ using namespace syclcompat::experimental; #define CUTLASS_ENABLE_DEBUG_PRINTS (0) #define LOG_GROUP (0) +#undef LOG_THREAD #define LOG_THREAD (0) template From 57ff558e884fa5e9a07f80959c396ab5b765b71d Mon Sep 17 00:00:00 2001 From: Nitin Singh Date: Fri, 12 Sep 2025 05:51:17 +0000 Subject: [PATCH 3/4] Use a distinct link time only flag --- CMakeLists.txt | 1 - benchmarks/gemm/benchmark_runner.hpp | 7 ++++--- cmake/FindDPCPP.cmake | 15 +++++++++------ .../02_bmg_gemm_f16_u4_f16.cpp | 1 - 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a631803f7c..7023721a2c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,6 @@ if (CUTLASS_ENABLE_SYCL) DPCPP_SYCL_TARGET STREQUAL "intel_gpu_bmg_g21") set(SYCL_INTEL_TARGET ON) add_compile_definitions(SYCL_INTEL_TARGET) - add_compile_options(-Wno-unused-command-line-argument) endif() add_compile_definitions(CUTLASS_ENABLE_SYCL) diff --git a/benchmarks/gemm/benchmark_runner.hpp b/benchmarks/gemm/benchmark_runner.hpp index 73e789c8e0..5576411220 100644 --- a/benchmarks/gemm/benchmark_runner.hpp +++ b/benchmarks/gemm/benchmark_runner.hpp @@ -453,13 +453,15 @@ struct BenchmarkRunnerGemm { } bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; + auto& M = cute::get<0>(problem_size); + auto& N = cute::get<1>(problem_size); + auto& K = cute::get<2>(problem_size); + auto& L = cute::get<3>(problem_size); TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N})); TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); auto [ptr_A, ptr_B] = [&]() { - auto [M, N, K, L] = problem_size; if constexpr (!is_mixed_dtype) { return make_tuple(block_A[0].get(), block_B[0].get()); } else { @@ -475,7 +477,6 @@ struct BenchmarkRunnerGemm { auto shape_scale = cute::make_shape(dq_mn_size, K / 128, L); static constexpr auto k_packed = CollectiveMainloop::zero_elements_packed_along_k; auto shape_zero = [&]() { - auto [M, N, K, L] = problem_size; if constexpr (is_tuple_v(stride_Z))>>) { return cute::make_shape(dq_mn_size, cute::make_shape(k_packed, cute::max(1, K / 128 / k_packed)), L); diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index 9f45285cdc..9acaa49f18 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -40,6 +40,7 @@ add_library(DPCPP::DPCPP INTERFACE IMPORTED) set(DPCPP_FLAGS "-fsycl;") set(DPCPP_COMPILE_ONLY_FLAGS "") +set(DPCPP_LINK_ONLY_FLAGS "") if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "") list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};") @@ -63,10 +64,10 @@ if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc" OR "${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR "${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21") if ((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE) - list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") + list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") else() - list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") - endif() + list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") + endif() if(DPCPP_DISABLE_ITT_FOR_CUTLASS) list(APPEND DPCPP_FLAGS "-fno-sycl-instrument-device-code") endif() @@ -76,14 +77,16 @@ endif() if(UNIX) set_target_properties(DPCPP::DPCPP PROPERTIES INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}" - INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}" INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include") message(STATUS "DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include") - message(STATUS "Using DPCPP flags: ${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}") + message(STATUS "Using DPCPP compile flags: ${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}") + message(STATUS "Using DPCPP link flags: ${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}") else() set_target_properties(DPCPP::DPCPP PROPERTIES INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}" + INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}" INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl") endif() @@ -105,7 +108,7 @@ function(add_sycl_to_target) ) get_target_property(target_type ${CUTLASS_ADD_SYCL_TARGET} TYPE) if (NOT target_type STREQUAL "OBJECT_LIBRARY") - target_link_options(${CUTLASS_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) + target_link_options(${CUTLASS_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS} ${DPCPP_LINK_ONLY_FLAGS}) endif() endfunction() diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp index 0fb8be1470..c84658eec8 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp @@ -452,7 +452,6 @@ struct ExampleRunner { /// Initialize operands to be used in the GEMM and reference GEMM void initialize(Options const& options) { - // auto [M, N, K, L] = ProblemShapeType{options.m, options.n, options.k, options.l}; auto problem_shape = ProblemShapeType{options.m, options.n, options.k, options.l}; auto& M = cute::get<0>(problem_shape); auto& N = cute::get<1>(problem_shape); From 6c751ced7a45b3044807aa6190883d5a6351cf84 Mon Sep 17 00:00:00 2001 From: Nitin Singh Date: Mon, 15 Sep 2025 16:54:56 +0000 Subject: [PATCH 4/4] Handle warnings from gtest 1) Add -Wno-unknown-warning-option to handle "unknown warning option '-Wno-implicit-float-size-conversion'" 2) Add Winline to "don't raise error for this warning list" for Werror --- cmake/googletest.cmake | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index 5249b328fd..6912bef5d3 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -44,6 +44,15 @@ FetchContent_Declare( FetchContent_MakeAvailable(googletest) +if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") + if (TARGET gtest) + # Ignore unsupported warning flags on IntelLLVM + target_compile_options(gtest PRIVATE -Wno-unknown-warning-option) + # Show -Winline warnings, but don’t let them become errors + target_compile_options(gtest PRIVATE -Wno-error=inline) + endif() +endif() + if (MSVC) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) -endif() +endif() \ No newline at end of file