From 2c611c7c491fd82512e3912ef98e7c3b7f32a245 Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Mon, 24 Jul 2023 12:12:15 +0000 Subject: [PATCH 01/55] [MTAI] build(system): enable build system in paddle for MUSA --- CMakeLists.txt | 31 ++ cmake/flags.cmake | 4 - cmake/generic.cmake | 68 ++++ cmake/mccl.cmake | 31 ++ cmake/mudnn.cmake | 66 ++++ cmake/musa.cmake | 33 ++ .../distributed/fleet_executor/carrier.cc | 2 +- .../fleet_executor/cond_interceptor.cc | 2 +- .../distributed/fleet_executor/dist_model.cc | 2 +- .../eager_generated/backwards/scale_node.cc | 2 +- .../generator/python_c_gen.py | 2 +- paddle/fluid/eager/nan_inf_utils.cc | 2 +- paddle/fluid/framework/conv_search_cache.h | 8 +- .../fluid/framework/copy_same_tensor_test.cc | 2 +- paddle/fluid/framework/custom_operator.cc | 4 +- paddle/fluid/framework/data_feed.cc | 2 +- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_feed_factory.cc | 2 +- .../fluid/framework/details/build_strategy.cc | 4 +- .../details/eager_deletion_op_handle.cc | 27 +- .../details/eager_deletion_op_handle.h | 2 +- .../details/fetch_async_op_handle.cc | 2 +- .../framework/details/fetch_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 27 +- .../details/gather_op_handle_test.cc | 4 +- .../framework/details/nan_inf_utils_detail.cc | 2 +- .../fluid/framework/details/op_handle_base.cc | 46 ++- .../fluid/framework/details/op_handle_base.h | 2 +- .../details/reduce_op_handle_test.cc | 2 +- .../details/scale_loss_grad_op_handle.cc | 4 +- .../details/share_tensor_buffer_op_handle.cc | 2 +- paddle/fluid/framework/details/var_handle.h | 4 +- paddle/fluid/framework/device_worker.h | 14 +- paddle/fluid/framework/dlpack_tensor.cc | 4 +- paddle/fluid/framework/dlpack_tensor_test.cc | 2 +- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/fleet/box_wrapper.cu | 7 +- paddle/fluid/framework/fleet/box_wrapper.h | 5 +- .../fluid/framework/fleet/box_wrapper_impl.h | 15 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 7 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 2 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 6 +- paddle/fluid/framework/fleet/heter_wrapper.h | 2 +- paddle/fluid/framework/garbage_collector.cc | 8 +- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/framework/ir/cost_model.cc | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 2 +- .../framework/ir/fuse_bn_add_act_pass.cc | 2 +- .../ir/fusion_group/code_generator_tester.cc | 2 +- ...est_reference_count_pass_last_lived_ops.cc | 2 +- .../interpreter/execution_config.cc | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../new_executor/interpreter_base_impl.h | 2 +- .../new_executor/new_ir_interpreter.cc | 4 +- .../fluid/framework/new_executor/profiler.h | 2 +- .../new_executor/program_interpreter.cc | 4 +- paddle/fluid/framework/op_registry.h | 2 +- paddle/fluid/framework/operator.cc | 12 +- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 26 +- paddle/fluid/framework/phi_utils.cc | 2 +- paddle/fluid/framework/phi_utils.h | 2 +- paddle/fluid/framework/pull_dense_worker.cc | 14 +- paddle/fluid/framework/section_worker.cc | 2 +- paddle/fluid/framework/tensor_test.cc | 10 +- paddle/fluid/framework/tensor_util.cc | 14 +- paddle/fluid/framework/tensor_util.h | 8 +- paddle/fluid/framework/tensor_util_test.cc | 10 +- paddle/fluid/framework/trainer.h | 6 +- paddle/fluid/framework/var_type_traits.h | 4 +- paddle/fluid/imperative/amp_auto_cast.cc | 2 +- .../fluid/imperative/gradient_accumulator.cc | 18 +- 
paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/fluid/imperative/tracer.cc | 6 +- .../ir_params_sync_among_devices_pass.cc | 4 +- .../ir_params_sync_among_devices_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 10 +- .../fluid/inference/api/analysis_predictor.cc | 28 +- .../fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- .../inference/api/details/zero_copy_tensor.cc | 14 +- paddle/fluid/inference/api/infer_context.cc | 2 +- paddle/fluid/inference/api/infer_context.h | 2 +- .../fluid/inference/api/resource_manager.cc | 15 +- paddle/fluid/inference/api/resource_manager.h | 6 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 5 +- paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- .../memory/allocation/allocator_facade.cc | 28 +- .../memory/allocation/allocator_facade.h | 2 +- .../allocator_facade_abs_flags_test.cc | 6 +- .../allocator_facade_frac_flags_test.cc | 6 +- ...o_growth_best_fit_allocator_facade_test.cc | 6 +- .../memory/allocation/buddy_allocator.cc | 6 +- .../memory/allocation/buddy_allocator_test.cc | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 6 + .../cuda_device_context_allocator.h | 15 +- .../allocation/cuda_managed_allocator.cc | 5 + .../allocation/naive_best_fit_allocator.cc | 28 +- .../naive_best_fit_allocator_test.cc | 2 +- .../memory/allocation/pinned_allocator.cc | 8 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../memory/allocation/system_allocator.cc | 8 +- .../memory/allocation/system_allocator.h | 2 +- .../allocation/system_allocator_test.cc | 2 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- paddle/fluid/memory/memcpy.cc | 78 ++++- paddle/fluid/memory/memory_stats_test.cc | 2 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- .../fluid/operators/class_center_sample_op.cu | 14 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../operators/collective/c_wait_comm_op.cc | 5 +- .../operators/collective/c_wait_compute_op.cc | 5 +- .../controlflow/conditional_block_op.h | 2 +- paddle/fluid/operators/controlflow/feed_op.cc | 2 +- .../operators/controlflow/get_places_op.cc | 4 +- .../operators/controlflow/while_op_helper.cc | 2 +- .../operators/detection/target_assign_op.h | 8 +- paddle/fluid/operators/dgc_op.h | 2 +- paddle/fluid/operators/expand_as_op.cc | 2 +- paddle/fluid/operators/expand_op.cc | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 4 +- .../fused_embedding_eltwise_layernorm_op.cu | 17 +- .../fused_softmax_mask_upper_triangle_op.cu | 4 + .../get_tensor_from_selected_rows_op.cc | 2 +- .../fluid/operators/graph_khop_sampler_op.cu | 17 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 5 +- .../operators/margin_cross_entropy_op.cu | 4 +- .../operators/math/bert_encoder_functor.h | 2 +- paddle/fluid/operators/math/prelu.h | 2 +- paddle/fluid/operators/math/sample_prob.h | 2 +- paddle/fluid/operators/matmul_op.cc | 2 +- paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 2 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/nop_op.cc | 2 +- .../fluid/operators/pad_constant_like_op.cc | 2 +- .../operators/pscore/send_and_recv_op.cc | 2 +- 
paddle/fluid/operators/random_crop_op.h | 4 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 4 +- .../fluid/operators/reader/buffered_reader.h | 4 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/select_op_helper.h | 2 +- .../sequence_ops/sequence_reverse_op.h | 4 +- .../sequence_ops/sequence_softmax_op.cc | 2 +- paddle/fluid/operators/shuffle_batch_op.cu | 2 +- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- paddle/fluid/operators/sync_batch_norm_op.cu | 50 ++- paddle/fluid/platform/complex_test.cu | 2 +- paddle/fluid/platform/device/device_wrapper.h | 2 +- paddle/fluid/platform/device/gpu/gpu_dnn.h | 2 +- paddle/fluid/platform/device/gpu/gpu_helper.h | 2 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 7 + paddle/fluid/platform/device/gpu/gpu_info.h | 2 +- .../platform/device/gpu/gpu_launch_config.h | 2 +- .../platform/device/gpu/gpu_resource_pool.cc | 2 +- .../platform/device/gpu/gpu_resource_pool.h | 2 +- paddle/fluid/platform/device/gpu/gpu_types.h | 12 +- paddle/fluid/platform/device_code_test.cc | 2 +- paddle/fluid/platform/device_context.cc | 10 +- paddle/fluid/platform/device_context.h | 4 +- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/platform/enforce_test.cc | 2 +- paddle/fluid/platform/init.cc | 10 +- paddle/fluid/platform/init_test.cc | 2 +- paddle/fluid/platform/place.h | 4 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.h | 4 +- .../platform/profiler/chrometracing_logger.cc | 2 +- .../platform/profiler/chrometracing_logger.h | 2 +- .../profiler/dump/deserialization_reader.cc | 4 +- .../profiler/dump/deserialization_reader.h | 2 +- .../profiler/dump/serialization_logger.cc | 2 +- .../profiler/dump/serialization_logger.h | 2 +- .../fluid/platform/profiler/event_python.cc | 6 +- paddle/fluid/platform/profiler/event_python.h | 6 +- paddle/fluid/platform/profiler/profiler.cc | 4 +- paddle/fluid/platform/profiler_helper.h | 4 +- paddle/fluid/platform/profiler_test.cc | 2 +- paddle/fluid/pybind/cuda_streams_py.cc | 20 +- paddle/fluid/pybind/cuda_streams_py.h | 4 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 4 +- paddle/fluid/pybind/generator_py.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 12 +- paddle/fluid/pybind/parallel_executor.cc | 2 +- paddle/fluid/pybind/place.cc | 16 +- paddle/fluid/pybind/process_group_utils.h | 4 +- paddle/fluid/pybind/pybind.cc | 12 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 12 +- paddle/phi/CMakeLists.txt | 3 + paddle/phi/api/include/context_pool.h | 2 +- paddle/phi/api/include/tensor.h | 2 +- paddle/phi/api/lib/context_pool.cc | 4 +- paddle/phi/api/lib/data_transform.cc | 6 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/api/lib/tensor_utils.cc | 4 +- paddle/phi/api/profiler/event.h | 8 +- paddle/phi/backends/CMakeLists.txt | 6 +- paddle/phi/backends/context_pool.cc | 2 +- paddle/phi/backends/context_pool.h | 4 +- paddle/phi/backends/device_code.cc | 6 +- paddle/phi/backends/device_code.h | 2 +- paddle/phi/backends/device_memory_aligment.h | 2 +- paddle/phi/backends/gpu/gpu_context.cc | 2 +- paddle/phi/backends/gpu/gpu_context.h | 6 +- paddle/phi/backends/gpu/gpu_device_function.h | 2 +- paddle/phi/backends/gpu/gpu_dnn.h | 2 +- paddle/phi/backends/gpu/gpu_helper.h | 2 +- 
paddle/phi/backends/gpu/gpu_info.h | 2 +- paddle/phi/backends/gpu/gpu_launch_config.h | 2 +- paddle/phi/backends/gpu/gpu_types.h | 4 +- paddle/phi/backends/gpu/musa/musa_info.cc | 329 ++++++++++++++++++ paddle/phi/capi/lib/c_device_context.cc | 2 +- paddle/phi/capi/lib/c_kernel_context.cc | 2 +- paddle/phi/common/backend.h | 2 +- paddle/phi/common/complex.h | 4 +- paddle/phi/common/float16.h | 2 +- paddle/phi/common/memory_utils.cc | 2 +- paddle/phi/common/memory_utils.h | 6 +- paddle/phi/common/place.cc | 4 +- paddle/phi/core/compat/convert_utils.cc | 6 +- paddle/phi/core/cuda_stream.h | 18 + paddle/phi/core/enforce.h | 2 +- paddle/phi/core/flags.cc | 16 +- paddle/phi/core/generator.cc | 4 +- paddle/phi/core/kernel_factory.cc | 4 +- paddle/phi/core/kernel_registry.h | 4 +- paddle/phi/core/kernel_utils.h | 2 +- paddle/phi/core/mixed_vector.cc | 4 +- paddle/phi/core/string_tensor.cc | 2 +- paddle/phi/core/tensor_utils.cc | 16 +- paddle/phi/core/utils/type_info.cc | 4 +- paddle/phi/core/utils/visit_place.h | 4 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/CMakeLists.txt | 2 +- paddle/phi/kernels/activation_kernel.cc | 2 +- paddle/phi/kernels/assign_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/CMakeLists.txt | 2 +- paddle/phi/kernels/funcs/blas/blas.h | 4 +- .../phi/kernels/funcs/detail/strided_memcpy.h | 6 +- paddle/phi/kernels/funcs/layer_norm_util.h | 4 +- paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/math_function.h | 2 +- paddle/phi/kernels/funcs/pooling.h | 6 +- paddle/phi/kernels/funcs/select_impl.cu.h | 2 +- paddle/phi/kernels/funcs/softmax.h | 2 +- paddle/phi/kernels/funcs/strided_memcpy.h | 2 +- .../fusion/gpu/fused_softmax_mask_utils.h | 2 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/reduce.h | 2 +- paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/group_norm_kernel.h | 2 +- .../kernels/impl/segment_pool_kernel_impl.h | 2 +- paddle/phi/kernels/impl/warpctc_kernel_impl.h | 2 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 4 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/kps/elementwise_kernel.cu | 2 +- paddle/phi/kernels/layer_norm_kernel.h | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 4 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- paddle/phi/kernels/prod_kernel.cc | 2 +- paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- .../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 4 +- .../kernels/selected_rows/isfinite_kernel.cc | 4 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- .../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/shape_kernel.cc | 2 +- paddle/phi/kernels/sparse/empty_kernel.cc | 2 +- .../sparse/sparse_utils_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_grad_kernel.cc | 
2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 4 +- .../kernels/strings/strings_empty_kernel.cc | 2 +- paddle/phi/kernels/strings/unicode.cc | 2 +- paddle/phi/kernels/strings/unicode.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 4 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- paddle/testing/paddle_gtest_main.cc | 4 +- 312 files changed, 1459 insertions(+), 614 deletions(-) create mode 100644 cmake/mccl.cmake create mode 100644 cmake/mudnn.cmake create mode 100644 cmake/musa.cmake create mode 100644 paddle/phi/backends/gpu/musa/musa_info.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 632cf33100c7e..2f05a7eb080fa 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF) option(WITH_XPU_PLUGIN "Compile PaddlePaddle with BAIDU XPU plugin" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_MUSA "Compile PaddlePaddle with MUSA platform" OFF) option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) @@ -89,6 +90,9 @@ endif() if(WITH_GPU AND WITH_ROCM) message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() +if(WITH_GPU AND WITH_MUSA) + message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -346,6 +350,7 @@ if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT WITH_GPU AND NOT WITH_ROCM + AND NOT WITH_MUSA AND NOT WITH_XPU AND NOT WITH_XPU_KP AND NOT WITH_XPU_XFT @@ -503,6 +508,31 @@ else() endif() endif() +if(WITH_MUSA) + include(musa) + include(mudnn) +endif() + +if(NOT WITH_MUSA AND WITH_MCCL) + message( + WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.") + set(WITH_MCCL + OFF + CACHE STRING "Disable MCCL when compiling without MUSA" FORCE) +endif() + +if(WITH_MCCL) + add_definitions("-DPADDLE_WITH_MCCL") + include(mccl) +else() + if(WITH_MUSA) + message( + WARNING + "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used." 
+ ) + endif() +endif() + if(WITH_HETERPS AND WITH_PSLIB) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -733,6 +763,7 @@ if(WITH_CPP_DIST) endif() endif() +include_directories(/usr/lib/llvm-11/include/openmp/) add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index a32dea08e5bff..3e95ed25ce473 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -141,15 +141,11 @@ if(NOT WIN32) set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer - -Werror - -Wall - -Wextra -Wno-unused-parameter -Wno-unused-function -Wno-error=array-bounds #Warning in Eigen, gcc 12.2 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 - -Wimplicit-fallthrough=0 # Warning in tinyformat.h ${fsanitize}) if(WITH_IPU) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 947d44950d52b..28aecb580a637 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -425,6 +425,9 @@ function(cc_binary TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + #if(WITH_MUSA) + # target_link_libraries(${TARGET_NAME} ${MUSA_LIB}) + #endif() check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) @@ -775,6 +778,71 @@ function(hip_test TARGET_NAME) endif() endfunction() +function(musa_library TARGET_NAME) + if(WITH_MUSA) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(musa_library_SRCS) + # TODO(MTAI): enable compiling static library + #if(musa_library_SHARED OR musa_library_shared) # build *.so + # musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + #else() + # musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) + # find_fluid_modules(${TARGET_NAME}) + # find_phi_modules(${TARGET_NAME}) + #endif() + musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + if(musa_library_DEPS) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${musa_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND musa_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else() + if(musa_library_DEPS) + list(REMOVE_DUPLICATES musa_library_DEPS) + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:musa_library") + + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + else() + message(FATAL "Please specify source file or library in musa_library.") + endif() + endif() + endif() +endfunction() + +function(musa_binary TARGET_NAME) + if(WITH_MUSA) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + musa_add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + if(musa_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction() + +# TODO(MTAI): enable musa_test +#function(musa_test TARGET_NAME) +#endfunction() + function(xpu_library TARGET_NAME) if(WITH_XPU_KP) set(options STATIC static SHARED
shared) diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake new file mode 100644 index 0000000000000..12191a2711d46 --- /dev/null +++ b/cmake/mccl.cmake @@ -0,0 +1,31 @@ +if(NOT WITH_MUSA) + return() +endif() + +# Now we don't support MCCL on windows +if(WIN32) + return() +endif() + +# FIXME(MTAI): please make sure that we can find MCCL successfully +if(WITH_MCCL) + set(MCCL_ROOT + ${MUSA_PATH}/mccl + CACHE PATH "MCCL ROOT") + find_path( + MCCL_INCLUDE_DIR mccl.h + PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include + $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include + NO_DEFAULT_PATH) + + file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) + + string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" MCCL_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" MCCL_VERSION + "${MCCL_VERSION}") + + message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. " + "Current MCCL version is v${MCCL_VERSION}. ") +endif() + diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake new file mode 100644 index 0000000000000..80c74c9131c21 --- /dev/null +++ b/cmake/mudnn.cmake @@ -0,0 +1,66 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(WIN32) + return() +endif() + +find_path( + MUDNN_INCLUDE_DIR mudnn.h + PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) + +get_filename_component(__libpath_hist ${MUSA_MUSART_LIBRARY} PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list( + APPEND + MUDNN_CHECK_LIBRARY_DIRS + ${MUDNN_ROOT} + ${MUDNN_ROOT}/lib64 + ${MUDNN_ROOT}/lib + ${MUDNN_ROOT}/lib/x64 + ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${MUDNN_ROOT}/local/cuda-${MUSA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/lib64 + $ENV{MUDNN_ROOT}/lib + $ENV{MUDNN_ROOT}/lib/x64 + /usr/lib + ${MUSA_TOOLKIT_ROOT_DIR} + ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) +set(MUDNN_LIB_NAME "") + +if(LINUX) + set(MUDNN_LIB_NAME "libmudnn.so") +endif() + +find_library( + MUDNN_LIBRARY + NAMES ${MUDNN_LIB_NAME} + PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to muDNN library.") + +if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) + set(MUDNN_FOUND ON) +else() + set(MUDNN_FOUND OFF) +endif() + +#macro(find_cudnn_version cudnn_header_file) +#endmacro() + +#if(MUDNN_FOUND) +# find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn.h) +# if(NOT MUDNN_MAJOR_VERSION) +# find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) +# endif() +#endif() + diff --git a/cmake/musa.cmake b/cmake/musa.cmake new file mode 100644 index 0000000000000..39245d726d4f9 --- /dev/null +++ b/cmake/musa.cmake @@ -0,0 +1,33 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(NOT DEFINED ENV{MUSA_PATH}) + set(MUSA_PATH + "/usr/local/musa" + CACHE PATH "Path to which ROCm has been installed") +else() + set(MUSA_PATH + $ENV{MUSA_PATH} + CACHE PATH "Path to which ROCm has been installed") +endif() +set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +find_package(MUSA REQUIRED) +include_directories(${MUSA_PATH}/include) + +#macro(find_musa_version version_file) +#endmacro() +#find_musa_version(${MUSA_PATH}/version.h) + +if(WITH_CINN) + list(APPEND MUSA_MCC_FLAGS -std=c++14) +else() + list(APPEND MUSA_MCC_FLAGS -std=c++17) +endif() + +set(MUSA_VERBOSE_BUILD ON) +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND MUSA_MCC_FLAGS -g2) + 
list(APPEND MUSA_MCC_FLAGS -O0) +endif() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 82d99a3835230..dc92bb8f699d6 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -272,7 +272,7 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 2e3389af5feb5..df284822390d0 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,7 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 4836d656d180f..4328941d60a65 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 7567236c4ff68..7050947466d23 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -132,7 +132,7 @@ void ScaleAPI(const paddle::Tensor& x, bias_after_scale, dense_out.get()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { auto* dev_ctx = dynamic_cast(pool.Get(expected_kernel_place)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 7fe53febc5a9b..b96b997976be4 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -124,7 +124,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if 
(paddle::platform::is_gpu_place(place)) {{ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index a66bc211d513c..e3e5968426462 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -98,7 +98,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 1620c99ce8560..cbac8cac4e543 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -32,7 +32,7 @@ class ConvSearchCache { static ConvSearchCache instance; return instance; } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) AlgorithmsCache* GetForward() { return &forward_cache_; } @@ -45,6 +45,8 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } +#elif defined(PADDLE_WITH_MUSA) + #else AlgorithmsCache* GetForward() { return &forward_cache_; @@ -67,11 +69,13 @@ class ConvSearchCache { ConvSearchCache(const ConvSearchCache&) {} ConvSearchCache& operator=(const ConvSearchCache&) {} -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; +#elif defined(PADDLE_WITH_MUSA) + #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 10e0b76f00459..3bc05d706ade9 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -32,7 +32,7 @@ namespace framework { static std::vector CreatePlaceList() { std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(0)); #endif return places; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index ebfed9a6f73f6..64f6214fca0c9 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -123,7 +123,7 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (custom_in.is_gpu_pinned()) { VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -1174,7 +1174,7 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, 
proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 32c4845bd0d57..d99e7739e8e39 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1526,7 +1526,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 1057640842c2c..875b8ca13da83 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -1951,7 +1951,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index e058b19469000..368807f72dfc4 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,7 @@ REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 69f7a49ce55fd..dc66ca6922e35 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -545,7 +545,7 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4012263f688cb..3e204548fa151 100644 --- 
a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,15 +44,18 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -75,12 +78,14 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -89,7 +94,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -177,16 +182,20 @@ void EagerDeletionOpHandle::RunImpl() { void EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -197,7 +206,7 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif }
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 0a92269c50ad2..049b0c2ec478b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,7 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 9fd6a08e02302..d96ab68ec823c 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,7 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index a36b63da9b8b6..9ea280a8d8bc5 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,7 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 29d5697b23f0d..b07211a6b18d7 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,7 @@ typedef std::vector< std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,11 +61,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -100,9 +102,12 @@ void FusedAllReduceOpHandle::RunImpl() { "when
using GPU device.")); auto create_event = [](gpuEvent_t *event) { if (*event) return; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -122,10 +127,14 @@ void FusedAllReduceOpHandle::RunImpl() { auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); nccl_stream = nccl_ctx.stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -185,12 +194,16 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (FLAGS_allreduce_record_one_event) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3437eb5570dc7..455879f02e833 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -47,7 +47,7 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -224,7 +224,7 @@ TEST(GatherTester, TestCPUGatherTestSelectedRows) { test_op.TestGatherSelectedRows(input_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(GatherTester, TestGPUGatherTestSelectedRows) { TestGatherOpHandle test_op; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 80c029a5fd976..bc8a31a35a95c 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" << tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 82f09f51c23e1..69fb0df678f65 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,11 +31,13 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (auto &ev : events_) { if (ev.second) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -45,13 +47,16 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; platform::SetDeviceId(dev_id); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -136,7 +141,7 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) { InitCUDA(); } @@ -172,7 +177,7 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -186,8 +191,10 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { } else { auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif @@ -221,12 +228,15 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast(dev_ctxes_.at(place))->stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -248,7 +258,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -273,13 +283,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -311,15 +324,18 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -331,7 +347,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9afe56e4babd4..4bd385ff5099c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,7 @@ class OpHandleBase { // See https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::unordered_map events_; #endif diff --git
a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 7587fb6553cd7..205567a39ecd7 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -303,7 +303,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) { test_op.InitReduceOp(out_scope_idx); test_op.TestReduceLodTensors(out_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(ReduceTester, TestGPUReduceTestSelectedRows) { TestReduceOpHandle test_op; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 9dac1a7203f8d..8b487b5a0bffb 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,7 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +110,7 @@ void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); tensor->Resize(phi::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 02a68fb697efb..cb16915316ecf 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index a6314220d5c26..9a130bea0d3a2 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,7 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +154,7 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 25d29e469a498..1da0aae399c37 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -84,11 +84,11 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); @@ -154,7 +154,7 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector copy_streams_; #endif std::vector places_; @@ -185,7 +185,7 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -561,7 +561,7 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -629,7 +629,7 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -802,7 +802,7 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 46b917cda740a..e5e8bae0bbd79 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; @@ -108,7 +108,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index f6b28b0a22ebc..7bf07aac14127 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -108,7 +108,7 @@ void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { template void TestMainLoop() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e0ad2255743c4..40606c4911649 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -492,7 +492,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, std::unique_ptr gc; if (!ctx->force_disable_gc_ && max_memory_size >= 0) { if (platform::is_gpu_place(place_)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } else { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 5f46906cf8e82..389b1f99eed72 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -156,11 +156,16 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 9853c328cd14e..054298795305e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -593,8 +593,11 @@ class BoxWrapper { auto* gpu_data = gpu_tensor.data(); auto len = gpu_tensor.numel(); data->resize(len); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index d72e418aadd3e..09696c824fbbd 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) ||
defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = keys_tensor[device_id]; @@ -61,7 +61,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), @@ -70,6 +70,15 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + musaMemcpyHostToDevice); + musaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -153,7 +162,7 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 05433c1014656..f2c6892c6cd11 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,7 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -813,9 +813,12 @@ void FleetWrapper::PushDenseVarsAsync( g_data, sizeof(float) * count, stream); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); + musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index fb5cf91729256..1284b379c9f20 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,7 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 
2cae0721aefa9..761ef1cf8051a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +141,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, @@ -169,7 +169,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 77838fbec6d00..70cbce2acc24d 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,7 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 3296679e1eeeb..1d3937ba2b982 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "gflags/gflags.h" @@ -64,7 +64,7 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -91,8 +91,10 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f3d9ec54e6968..9727654d04c84 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,7 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 9ca3190fd092f..9ac931f2501a7 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +152,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 299e700edb95d..322fcb0f7cf48 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,7 +34,7 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; 
diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 506e8721298b6..a218e768ac41d 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,7 +25,7 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 06593733e6a27..7ffd09d2474df 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,7 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index a0f1d9eed0038..b986fc5b37adb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -203,7 +203,7 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 1e6a6f02e2230..aa769089d7fed 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,7 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = std::thread::hardware_concurrency(); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 13896b66f3c55..e70d6fabd5c05 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -641,7 +641,7 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, 
expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 1ae7e5e59ce1f..66a41274cd105 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,7 @@ DECLARE_bool(benchmark); DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_new_ir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 3b40a3b0727f1..eae90f2a29739 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -892,7 +892,7 @@ void NewIRInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1245,7 +1245,7 @@ void NewIRInterpreter::RecordStreamForGC(const Instruction& instr) { void NewIRInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 95eee77d36288..f2fa9fd50eedb 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,7 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index b6c54192a6970..04cbca42c152a 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -880,7 +880,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1232,7 +1232,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void 
ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 8cb29a0d5df4c..db535b4fa58de 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -358,7 +358,7 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1544,7 +1544,7 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= (dev_ctx.cudnn_handle() != nullptr); @@ -1783,7 +1783,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2109,7 +2109,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2132,7 +2132,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e6a2058107b1d..68df442f4a5fa 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -575,7 +575,7 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 806b8570108b9..ccf4534bddbb2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,14 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +69,7 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::once_flag p2p_init_flag; #endif @@ -512,7 +512,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size)); } else { @@ -621,7 +621,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -638,10 +638,14 @@ void InitP2P(const std::vector &places) { for (int j = 0; j < count; ++j) { if (devices[i] == devices[j]) continue; int can_acess = -1; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { +#elif defined(PADDLE_WITH_MUSA) + musaError_t ret = + musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -651,8 +655,10 @@ void InitP2P(const std::vector &places) { << " to " << devices[j]; } else { platform::CUDADeviceGuard guard(devices[i]); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipDeviceEnablePeerAccess(devices[j], 0); +#elif defined(PADDLE_WITH_MUSA) + musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -1299,7 +1305,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1308,7 +1314,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( @@ -1674,7 +1680,7 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if 
(member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 9881d479a75a2..4d1bb616c33e2 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,7 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index f8589e95ff8e9..d5262264aa0cd 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 7b61052a20151..5cb310fd9a4a1 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,10 +69,10 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) copy_streams_.clear(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); @@ -80,7 +80,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { } void PullDenseWorker::CreatePinVar() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { @@ -96,7 +96,7 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -125,7 +125,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { @@ -144,7 +144,7 @@ void 
PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -179,7 +179,7 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 58e879a5011c2..cd436becfbe93 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -228,7 +228,7 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 5ef6f53d38d50..9b1e8ccf63e87 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -114,7 +114,7 @@ TEST(DenseTensor, MutableData) { EXPECT_EQ(static_cast(p2[0]), 1); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; float* p1 = nullptr; @@ -168,7 +168,7 @@ TEST(DenseTensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor dst_tensor; @@ -206,7 +206,7 @@ TEST(DenseTensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 9}), @@ -295,7 +295,7 @@ TEST(DenseTensor, Split) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), @@ -357,7 +357,7 @@ TEST(DenseTensor, Chunk) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 
d8224cb0dd72b..50f23057c61b1 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -124,7 +124,7 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -377,7 +377,7 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -479,7 +479,7 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -613,7 +613,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); @@ -686,7 +686,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); @@ -809,7 +809,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -849,7 +849,7 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h 
b/paddle/fluid/framework/tensor_util.h index 36a3e968251c9..77ab6f4918caf 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,7 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +175,7 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +304,7 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +346,7 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index bda2681f57f31..89c4a764b86f2 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) { } EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor gpu_tensor; @@ -153,7 +153,7 @@ TEST(TensorFromVector, Tensor) { delete cpu_place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -232,7 +232,7 @@ TEST(TensorToVector, Tensor) { EXPECT_EQ(src_ptr[i], dst[i]); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor gpu_tensor; @@ -323,7 +323,7 @@ TEST(TensorFromDLPack, Tensor) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -489,7 +489,7 @@ TEST(Tensor, FromAndToStream) { EXPECT_EQ(dst_tensor.dims(), src_tensor.dims()); delete place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor gpu_tensor; 
gpu_tensor.Resize({2, 3}); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index bf69bed9d4851..9a0d9880f5d04 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -172,7 +172,7 @@ class HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuEvent_t event_; #endif std::vector ops_; @@ -204,7 +204,7 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -242,7 +242,7 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector copy_streams_; std::vector events_; #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 2e188e6caa076..286ee379d82dd 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -59,7 +59,7 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class Communicator; class NCCLCommunicator; @@ -189,7 +189,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ncclUniqueId, platform::Communicator, diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index be715a2a451ad..1d424e81ba5ef 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -138,7 +138,7 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 14b9bc5aae0bc..8c78f7af783dd 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -204,7 +204,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_TENSOR_ADD(float, phi::GPUContext); 
PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -313,7 +313,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -321,7 +321,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -364,7 +364,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -372,7 +372,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -425,7 +425,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -441,7 +441,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -712,7 +712,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -778,7 +778,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index cda2fad5d7436..6401580096db8 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,7 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +555,7 @@ static void 
PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +645,7 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index ccb58d320221c..f7b67e027fb7b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -106,7 +106,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gc.reset(new framework::DefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -116,7 +116,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -274,7 +274,7 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 65e149925e742..2580a2aa8ec2a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -36,7 +36,7 @@ namespace paddle { namespace inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
if (!argument->use_gpu()) return; @@ -209,7 +209,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index ee29af1c13308..6ab7d83b8922d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 25c7e7e2a03d4..ea5ad99ea0be0 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +100,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -630,7 +630,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -928,7 +928,7 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1145,7 +1145,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. 
size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 56652c2f42cb7..12e893d72781f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -99,7 +99,7 @@ namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -270,7 +270,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -370,7 +370,7 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(inference): Now only gpu with external stream support private // device_context. if (config_.use_gpu_ && config_.use_external_stream_) { @@ -418,7 +418,7 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -489,14 +489,14 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Init GPUContext. 
if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -534,7 +534,7 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2151,7 +2151,7 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2160,8 +2160,10 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { } if (stream != predictor_stream_) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipStreamSynchronize(static_cast(predictor_stream_)); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2199,11 +2201,13 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipStreamSynchronize(stream); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2595,7 +2599,7 @@ AnalysisPredictor::~AnalysisPredictor() { if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index bde6ca48741ad..36c5d13a84521 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -220,7 +220,7 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Note: Can only be used under thread_local semantics.
bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 28353150c265c..f69a434f36f83 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 37ee2b4df643d..8791d6dfe0266 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -108,7 +108,7 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipStreamSynchronize(dev_ctx->stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(dev_ctx->stream()); #else // async, return stream if (nullptr != exec_stream) { @@ -821,7 +823,7 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); paddle::memory::Copy(gpu_place, @@ -891,7 +893,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index 533363f1b25da..57a7625aaef58 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -21,7 +21,7 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git 
a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 2b5c4e974eb08..19f285ad78b65 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,7 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 3f06ee5722af9..9f5df0edfa06c 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,7 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -99,9 +99,12 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { if (semaphore_ == NULL) { char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; semaphore_ = reinterpret_cast(scratch); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -132,7 +135,7 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -156,8 +159,10 @@ void GPUContextResource::InitGPUResource(void* stream) { void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -375,7 +380,7 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index e14de1c2ffc86..9686761029374 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -25,7 +25,7 @@ #include "paddle/phi/common/place.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include 
"paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -49,7 +49,7 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -141,7 +141,7 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // GPU Resource public: void* InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 6de5f9cfa0ca1..509b3f0b993f8 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 2500f624967c6..ed2993e7a39e7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -215,9 +215,12 @@ void QkvToContextPluginDynamic::configurePlugin( fake_qk_bias_ = reinterpret_cast( tensor_.mutable_data(platform::CUDAPlace(device_id))); int64_t size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 1a39590398911..aa96228a694d5 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,7 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) list( APPEND ALLOCATOR_SRCS diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 07e55115ba130..41635de256abe 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/macros.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -164,7 +164,7 @@ class AllocatorFacadePrivate { public: using 
AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using CUDAAllocatorMap = std::map>>; @@ -187,7 +187,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -214,7 +214,7 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -286,7 +286,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -345,7 +345,7 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -594,7 +594,7 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void InitNaiveBestFitCUDAPinnedAllocator() { allocators_[platform::CUDAPinnedPlace()] = std::make_shared(platform::CUDAPinnedPlace()); @@ -1038,7 +1038,7 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1064,7 +1064,7 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1112,7 +1112,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1145,7 +1145,7 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1252,7 +1252,7 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1278,7 +1278,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1290,7 +1290,7 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a1f21a5e69359..6f1b495891338 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -76,7 +76,7 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. 
uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index 1e09c43c4f12f..0cf8089f5a65f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +46,7 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +82,7 @@ void AllocateTestCases() { } TEST(Allocator, SpecifyGpuMemory) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and // FLAGS_reallocate_gpu_memory_in_mb FLAGS_fraction_of_gpu_memory_to_use = 0.0; diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 63e3eab3256c9..b60b53bc28f3c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +46,7 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +82,7 @@ void AllocateTestCases() { } TEST(Allocator, Allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index bfd05b6b323fe..b4d4699f1f039 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_int64(gpu_allocator_retry_time); @@ -41,7 +41,7 @@ static inline size_t AlignTo(size_t size, size_t alignment) { } TEST(allocator, allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; @@ -102,7 +102,7 @@ TEST(allocator, allocator) { TEST(multithread_allocate, test_segfault) { FLAGS_allocator_strategy = "auto_growth"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::mutex mtx; std::condition_variable cv; bool flag = false; diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 8de464754cb35..9c1402374b323 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define USE_DEVICE PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -53,7 +53,7 @@ BuddyAllocator::BuddyAllocator( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; #endif @@ -249,7 +249,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize( init_allocate_size_func_, re_allocate_size_func_, request_bytes); #else -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); #endif diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 1aeb1722d0ec8..6b99499824cfb 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -77,7 +77,7 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, return nullptr; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(BuddyAllocator, GpuFraction) { // In a 16 GB machine, the pool size will be about 160 MB FLAGS_fraction_of_gpu_memory_to_use = 0.01; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 781addd7dba60..da5fdc829e8c0 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -19,6 +19,12 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 7286f84160c6a..1401aeb7a11be 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -79,9 +79,12 @@ class GPUContextAllocator : public Allocator { gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreate(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); @@ -91,9 +94,10 @@ class GPUContextAllocator : public Allocator { ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); -#ifdef PADDLE_WITH_HIP - +#if defined(PADDLE_WITH_HIP) PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -110,9 +114,12 @@ class GPUContextAllocator : public Allocator { auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 06e9fbe88827b..d1b68212736ee 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include 
+#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index e436e6c439081..27a6e3857f224 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" @@ -213,7 +213,7 @@ size_t Used(const platform::XPUPlace &place) { } // For CUDA -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUBuddyAllocatorList { private: GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) { @@ -294,7 +294,7 @@ size_t Used(const platform::CUDAPlace &place) { template <> void *Alloc(const platform::CUDAPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -313,8 +313,10 @@ void *Alloc(const platform::CUDAPlace &place, string::HumanReadableSize(Used(place)))); } else { if (FLAGS_init_allocated_mem) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemset(ptr, 0xEF, size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0xEF, size); #else cudaMemset(ptr, 0xEF, size); #endif @@ -331,7 +333,7 @@ template <> void Free(const platform::CUDAPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GetGPUBuddyAllocator(place.device)->Free(p); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -341,7 +343,7 @@ void Free(const platform::CUDAPlace &place, template <> uint64_t Release(const platform::CUDAPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Release(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -349,7 +351,7 @@ uint64_t Release(const platform::CUDAPlace &place) { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; static BuddyAllocator *ba = nullptr; @@ -367,7 +369,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { template <> size_t Used(const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return GetCUDAPinnedBuddyAllocator()->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -378,7 +380,7 @@ size_t Used(const platform::CUDAPinnedPlace &place) { template <> void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
|| defined(PADDLE_WITH_MUSA) VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); void *ptr = buddy_allocator->Alloc(size); @@ -401,7 +403,7 @@ template <> void Free(const platform::CUDAPinnedPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -413,7 +415,7 @@ void Free(const platform::CUDAPinnedPlace &place, template <> uint64_t Release( const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Release on " << platform::Place(place); return GetCUDAPinnedBuddyAllocator()->Release(); #else @@ -602,7 +604,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const { } size_t Usage::operator()(const platform::CUDAPlace &gpu) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return Used(gpu); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -611,7 +613,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return Used(cuda_pinned); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c96..5ad4a729a6692 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -33,7 +33,7 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) { alloc.Release(platform::CPUPlace()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(NaiveBestFitAllocatorTest, GpuAlloc) { NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; { diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index f1c0178fafc02..33c6ca55880cd 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -21,8 +21,10 @@ namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostFree(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -35,8 +37,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostMalloc(&ptr, size, 
musaHostMallocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index d1872ee00b7b7..ef8692b64cc51 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -19,7 +19,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_allocator.h" #endif @@ -114,7 +114,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { platform::CUDAPlace p(0); RetryAllocator allocator(std::make_shared(p), retry_ms); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 210be01669775..4234b615c823b 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +120,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -214,8 +214,10 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. 
-#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaHostMalloc(&p, size, musaHostMallocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 67376a3e39a22..b2cce04a04d37 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc index e04d14f0adfde..a296755c12725 100644 --- a/paddle/fluid/memory/allocation/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -57,7 +57,7 @@ TEST(CPUAllocator, LockMem) { TestAllocator(&a, 0); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); TestAllocator(&a, 2048); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 46f9b1189cb68..f86d4f0f256ca 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,7 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index b8f5f0289c4bc..bd67a4eeefcac 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -48,7 +48,7 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 4a56a01e640bf..45b2ec3ca3875 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -256,10 +256,10 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) inline void SyncCUDAStream() { #if !defined(_WIN32) hipStreamSynchronize(0); @@ -271,6 +271,18 @@ inline void SyncCUDAStream() { } #endif } +#elif defined(PADDLE_WITH_MUSA) +inline void SyncCUDAStream() { +#if !defined(_WIN32) + musaStreamSynchronize(0); 
+#else + musaError_t e_sync = musaSuccess; + while (e_sync = musaStreamQuery(0)) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif +} #else inline void SyncCUDAStream() { #if !defined(_WIN32) @@ -307,12 +319,18 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -323,8 +341,10 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -351,12 +371,18 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -367,8 +393,10 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -397,12 +425,18 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -414,8 +448,10 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -496,12 +532,18 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -513,8 +555,10 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", 
platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -538,12 +582,18 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -555,8 +605,10 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -746,7 +798,7 @@ void Copy(phi::Place dst_place, dst_place.GetType() == phi::AllocationType::CPU) { std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc index 081f0d3d78c13..6afc2a852f0d6 100644 --- a/paddle/fluid/memory/memory_stats_test.cc +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -40,7 +40,7 @@ TEST(stat_allocator_test, host_memory_stat_test) { EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(stat_allocator_test, device_memory_stat_test) { std::vector alloc_sizes{ 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index d1dc7d8986bec..fde5de90c56dc 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -55,7 +55,7 @@ struct ArrayToLoDFunctor { if (std::is_same::value) { Apply(static_cast(pool.Get(place))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index f63baadbde526..5327be6909b4f 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -12,13 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include #include #include typedef hiprandState curandState; namespace cub = hipcub; + +#elif defined(PADDLE_WITH_MUSA) +#include +#include +#include #else #include #include @@ -67,11 +72,16 @@ __global__ void RandomSampleClassCenter(const int64_t n, size_t local_seed = (static_cast(seed) + 0x9E3779B9U + (static_cast(id) << 6U) + (static_cast(id) >> 2U)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hiprand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } +#elif defined(PADDLE_WITH_MUSA) + murand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(murand(&localState) % max_val); + } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index e100397924af5..79c32bc907045 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index bacbe014a343c..f3a34f2c7d057 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -57,9 +57,12 @@ class CWaitCommOp : public framework::OperatorBase { platform::NCCLCommContext::Instance().Get(ring_id, place)->comm_event(); // comm_stream-->event-->compute_stream -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 34569b0a4b600..4b9ca005be397 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -58,9 +58,12 @@ class CWaitComputeOp : public framework::OperatorBase { ->compute_event(); // compute_stream-->event-->comm_stream -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git 
a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 0f04a295ed263..d5419d2b13a4e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,7 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index c2deeb4190986..6b85a1d08657b 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -218,7 +218,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 9f67b1d4b6e18..1074c1c30f676 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,7 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +34,7 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 6ae32f33e957a..790f54612ffae 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -227,7 +227,7 @@ bool GetCondData(const phi::DenseTensor &cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 484bd8454bae9..0fd2a6883943b 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -120,7 +120,7 @@ class TargetAssignKernel : public framework::OpKernel { int64_t k = x->dims()[2]; auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) phi::MixVector mixv_x_lod(&x_lod); size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else @@ -137,7 +137,7 @@ class TargetAssignKernel : public framework::OpKernel { k, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) mixv_x_lod.CopyToCPU(); #endif @@ -154,7 +154,7 @@ class TargetAssignKernel : public framework::OpKernel { "TargetAssignOp input(NegIndices) needs 1 level of LoD")); const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::MixVector mixv_neg_lod(&neg_lod); size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else @@ -170,7 +170,7 @@ class TargetAssignKernel : public framework::OpKernel { mismatch_value, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) mixv_neg_lod.CopyToCPU(); #endif } diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 45f34313d1a3d..f1d37e447991c 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -188,7 +188,7 @@ class DGCOpKernel : public framework::OpKernel { int buf_size = paddle::communication::dgc::get_buffer_size(k); paddle::memory::allocation::AllocationPtr tmp_ious_data; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(dev_ctx.GetPlace())) { tmp_ious_data = memory::Alloc( dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 107fe9f6174b6..f0d31269da193 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -155,7 +155,7 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL(expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index fee4b47049301..490c6f9f6dbfc 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index b6dd3ca8f64b2..1bedf6cc54a4e 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -190,8 +190,10 @@ struct FindChannelAbsMaxFunctor { int grid = cout; int max_threads = 1024; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemset(out_abs_max, 0, sizeof(T) * cout); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // 
PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 35574331e17d7..0216564ed80a4 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -44,8 +44,10 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { framework::TransToPhiDataType(framework::proto::VarType::INT64)); framework::DDim in_dim{input_num}; int device_id; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipGetDevice(&device_id); +#elif defined(PADDLE_WITH_MUSA) + musaGetDevice(&device_id); #else cudaGetDevice(&device_id); #endif @@ -65,7 +67,7 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { in1s.push_back(reinterpret_cast(ids[i]->data())); in2s.push_back(reinterpret_cast(embs[i]->data())); } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpyAsync(in_ids_d, in1s.data(), sizeof(int64_t) * input_num, @@ -76,6 +78,17 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { sizeof(int64_t) * input_num, hipMemcpyHostToDevice, device_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync(in_ids_d, + in1s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); + musaMemcpyAsync(in_embs_d, + in2s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); #else cudaMemcpyAsync(in_ids_d, in1s.data(), diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 32e7cffa4984b..35d69faa1a41d 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -31,6 +31,10 @@ limitations under the License. */ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include #include diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ae92b04b7df4..c6a8a4fe7b982 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index e533960c8a648..7c9d1f3c921f7 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -29,9 +29,12 @@ limitations under the License. 
*/ #include -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -89,12 +92,18 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hiprandState rng; hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murandState rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, + 0, + &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -126,8 +135,10 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, #endif for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index dea3ce3fe695b..ea38db87e63e7 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8c123bb8a32f2..e1e9ca5ef6667 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index aab7953d6d103..940b3eaac0c10 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void 
TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 92f190c0025ed..2c6d72f109c13 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index dd85ccff87f2d..197aaa74bb3e1 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 94b0319729117..da8ea875e9393 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 11c35293ebe34..c627b1cf89dcd 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -218,9 +218,12 @@ struct LookupTableV2GradCUDAFunctor { const auto *ids = ids_t_->template data(); T *d_table = d_table_t->mutable_data(context_.GetPlace()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), 
dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d741bc5b42549..40231fd4bf2c4 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -13,9 +13,11 @@ // limitations under the License. // old op include, fluid should be removed -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include namespace cub = hipcub; +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 9a0b5a1ae3ab7..a9869e5faecce 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -47,7 +47,7 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // This functor involves a fusion calculation in Ernie or Bert. // The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 00ff1fbcbc38d..1762353abaa9f 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,7 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 7c60be6841552..b7b224a0baaf5 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,7 @@ class SampleWithProb { } }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index e1a36fa41894d..5208d0b2cf937 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -926,7 +926,7 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 5f480461d77cd..a4b6e061bfdff 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,7 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 007f853f3243f..20775d02aadfe 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { if
(platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 8c33a5da1baff..27a38571e1c80 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index 69f0bfb2abcd3..e99b3956d05b0 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -60,6 +60,6 @@ REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} #endif diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index d00cefab45045..72061fbc39630 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -260,7 +260,7 @@ PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(pad_constant_like, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 99e8d04a9e329..49623bb0ec206 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,7 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index fc625826b9a91..de03079b23035 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #endif @@ -37,7 +37,7 @@ struct Random { using UniformIntDist = std::uniform_int_distribution; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct Random { using Engine = thrust::minstd_rand; diff --git a/paddle/fluid/operators/rank_loss_op.cc 
b/paddle/fluid/operators/rank_loss_op.cc index ebdddfd41b33f..b9f05d663dba0 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a0ad7e3939a02..73b3823d3e5ab 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,7 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = @@ -118,7 +118,7 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 032a74b7e23f1..db849dc70b5da 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,7 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 962b18c995979..a089ad7d58fac 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,7 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +462,7 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +492,7 @@ class 
ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -761,7 +761,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index bc1f5a0d34f60..ab03d46486c2e 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -117,7 +117,7 @@ PD_REGISTER_KERNEL(save_sr, phi::dtype::float16, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 2b7f884f6170c..7e3de57345a4b 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,7 +39,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 2236988025cbc..13133e54f0415 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { const size_t *lod; size_t lod_count = x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +144,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 0ca5514900d46..e3af25c4b57f9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 5069cf1e512cb..7c8c6ca475b38 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifndef _MSC_VER #include diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index e648575a1edca..607ea43f50105 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 84e30250f85fd..fda42c80fbbf6 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -302,7 +302,25 @@ void SyncBatchNormCooGradKernel( } // namespace sparse } // namespace phi -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} +#elif defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(sync_batch_norm, GPU, ALL_LAYOUT, @@ -364,7 +382,19 @@ PD_REGISTER_KERNEL(sync_batch_norm, #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#elif defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(sync_batch_norm_grad, GPU, ALL_LAYOUT, @@ -397,13 +427,19 @@ PD_REGISTER_KERNEL(sync_batch_norm_grad, #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(sync_batch_norm_coo, GPU, ALL_LAYOUT, phi::sparse::SyncBatchNormCooKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooKernel, + float, #else 
PD_REGISTER_KERNEL(sync_batch_norm_coo, GPU, @@ -414,13 +450,19 @@ PD_REGISTER_KERNEL(sync_batch_norm_coo, phi::dtype::float16) {} #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, GPU, ALL_LAYOUT, phi::sparse::SyncBatchNormCooGradKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooGradKernel, + float, #else PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, GPU, diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu index b814bcde6841f..effccd3cce75e 100644 --- a/paddle/fluid/platform/complex_test.cu +++ b/paddle/fluid/platform/complex_test.cu @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index aa2dba03c9082..bcfb316837a30 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h index f6f6392c4c23d..2a9db61f83bc6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_dnn.h +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -16,7 +16,7 @@ #include "paddle/phi/backends/gpu/gpu_dnn.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 878a122a49224..7fde4429bb7f3 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 7f1f2c76bd630..94c85105115d6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -216,6 +216,8 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } +#elif defined(PADDLE_WITH_MUSA) + result = musaMalloc(ptr, size); #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -262,6 +264,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { +#elif defined(PADDLE_WITH_MUSA) + auto err = musaFree(ptr); + if (err != musaErrorMusaUnloading) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -309,6 +314,8 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); +#elif defined(PADDLE_WITH_MUSA) + auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index de68329bba66d..64cb1bd8fcab7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index d253a92c986ce..8ce858b4d37a1 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,7 +16,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 9f2168e1cdb8b..ee60040f09074 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License.
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 2ac13e692f783..ff1452153e7bd 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..83497a2507005 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,7 +15,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include @@ -95,6 +95,9 @@ using CUDAGraphID = unsigned long long; // NOLINT #ifdef PADDLE_WITH_HIP #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // CDUA #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ @@ -103,9 +106,10 @@ using CUDAGraphID = unsigned long long; // NOLINT DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 6b58453f03ea8..a4a810b34c3f0 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -45,7 +45,7 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { )"; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(DeviceCode, cuda) { if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) { return; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 456abd55ef68f..4a81291815373 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,7 @@ limitations under the License.
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +53,7 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +86,7 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -172,7 +172,7 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, @@ -209,7 +209,7 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b07b3f29dafde..2aa336486308d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -136,7 +136,7 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -165,7 +165,7 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 402974b89e5c9..cb43f00f7fe0f 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,7 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 37da8daf7fd69..09861f41874cd 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 425d4939b565f..105c5f0607f69 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -98,7 +98,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 9fc200ca82f1c..ef435721b93a0 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -345,7 +345,7 @@ TEST(EOF_EXCEPTION, THROW_EOF) { EXPECT_TRUE(caught_eof); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { PADDLE_ENFORCE_GPU_SUCCESS(value); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index b5f31fd85847c..fad64a6290486 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -172,7 +172,7 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) try { // use user specified GPUs in single-node multi-process mode. 
devices = platform::GetSelectedDevices(); @@ -215,7 +215,7 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(devices[i])); #endif #ifdef PADDLE_WITH_XPU @@ -226,7 +226,7 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -448,7 +448,7 @@ void InitMemoryMethod() { memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif memory_method->emplace_device_contexts = diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 66fb431af29e9..b643e37765668 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,7 @@ TEST(InitDevices, CUDA) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 959379260419d..9e00bd589dc70 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -57,7 +57,7 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -67,7 +67,7 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 2c65023988dc6..d1b557922af32 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -698,7 +698,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index c71b5a0e49104..7a13582736a50 100644 --- a/paddle/fluid/platform/profiler.h +++ 
b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -197,7 +197,7 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e3fe83c5a74d2..8fa4d8a483c4d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -561,7 +561,7 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 7f9bec1c32a53..81005aa91c10d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -40,7 +40,7 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 1d0970235a128..cc35371e06fc5 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,7 @@ std::unique_ptr DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +155,7 @@ DeserializationReader::~DeserializationReader() { input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 5f99f6fd82c55..c8ac33c5bea49 100644 
--- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,7 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index be1e1c01f8b52..9e46e3a531cd9 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,7 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (auto it = device_property_map.begin(); it != device_property_map.end(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 80d5413106ded..67eafdf44e3cd 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,7 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index eaea4f3850fef..7ec41fd78a5e3 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -137,7 +137,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ProfilerResult::ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -179,7 +179,7 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -187,7 +187,7 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h 
b/paddle/fluid/platform/profiler/event_python.h index dae32a1902834..f1d217674bf6c 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,7 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +166,7 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +176,7 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index e0a91629a10d6..8f34d5acc0bee 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -21,7 +21,7 @@ #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -161,7 +161,7 @@ std::unique_ptr Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto index = 0u; index < device_ids.size(); index++) { diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 1d34d5fd27b3e..5dad7788d0b09 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -142,7 +142,7 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -344,7 +344,7 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 0e1c681288fe1..1b746df388a2b 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -122,7 +122,7 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 2b8969e1b8181..da6dee7657c09 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -24,7 +24,7 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::CUDAStream *get_current_stream(int device_id) { if (device_id == -1) { device_id = phi::backends::gpu::GetCurrentDeviceId(); @@ -51,7 +51,7 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_get_current_stream", [](int deviceId) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::get_current_stream(deviceId); #else PADDLE_THROW( @@ -64,7 +64,7 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_set_current_stream", [](phi::CUDAStream *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::set_current_stream(stream); #else PADDLE_THROW( @@ -75,7 +75,7 @@ void BindCudaStream(py::module *m_ptr) { py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (device_id == -1) { device_id = paddle::platform::GetCurrentDeviceId(); } @@ -115,7 +115,7 @@ void BindCudaStream(py::module *m_ptr) { s3 = paddle.device.cuda.Stream() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def( "wait_event", [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) { @@ -251,7 +251,7 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -277,7 +277,7 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, int device, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -307,7 +307,7 @@ void BindCudaStream(py::module *m_ptr) { py::arg("device") = -1, py::arg("priority") = 2) .def("__init__", [](phi::CUDAStream &self) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int device_id = platform::GetCurrentDeviceId(); auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking; new (&self) phi::CUDAStream( @@ -334,7 +334,7 @@ void BindCudaStream(py::module *m_ptr) { 
event = paddle.device.cuda.Event() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def( "record", [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) { @@ -398,7 +398,7 @@ void BindCudaStream(py::module *m_ptr) { bool enable_timing, bool blocking, bool interprocess) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) unsigned int flags = platform::GenerateDeviceEventFlag( enable_timing, blocking, interprocess); new (&self) paddle::platform::CudaEvent(flags); diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h index d10608a6e8ea9..61f27960e25e9 100644 --- a/paddle/fluid/pybind/cuda_streams_py.h +++ b/paddle/fluid/pybind/cuda_streams_py.h @@ -17,7 +17,7 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #else namespace phi { @@ -29,7 +29,7 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::CUDAStream* get_current_stream(int device_id = -1); phi::CUDAStream* set_current_stream(phi::CUDAStream* stream); #endif diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 59a94a31c448d..7fdfcfe62f6a6 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -58,7 +58,7 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/pybind/cuda_streams_py.h" #endif diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 69d0465bf7cdd..46e099c1ecf5f 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -138,7 +138,7 @@ std::set _complex_dtypes{ void SetDevice(paddle::platform::Place place) { if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << static_cast(place.device); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index eb0e895cf575c..638542ea6dbaf 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -223,7 +223,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, sizeof_dtype * numel); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (self->tensor.is_gpu()) { eager_gil_scoped_release guard; #if defined(PADDLE_WITH_CUDA) @@ -1338,7 +1338,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, self_numpy[_index] = py::object(py::handle(value_obj), true); } if (!self->tensor.initialized()) { -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) SetTensorFromPyArray(self_tensor, self_numpy, platform::Place(platform::CUDAPlace(0)), diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 99621b1463ea9..c9a4e2b7fb52e 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,7 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. .def(py::pickle( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index bdf54bd76b6e1..72b47bb154513 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -43,7 +43,7 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/compat/convert_utils.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -658,7 +658,7 @@ void BindPaddlePredictor(py::module *m) { .def("get_output_names", &PaddlePredictor::GetOutputNames) .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](PaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -705,7 +705,7 @@ void BindNativePredictor(py::module *m) { .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) .def("clone", [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](NativePaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -750,7 +750,7 @@ void BindAnalysisConfig(py::module *m) { .def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass) .def("exp_disable_mixed_precision_ops", &AnalysisConfig::Exp_DisableMixedPrecisionOps) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("set_exec_stream", [](AnalysisConfig &self, phi::CUDAStream &stream) { self.SetExecStream(stream.raw_stream()); @@ -1084,7 +1084,7 @@ void BindAnalysisPredictor(py::module *m) { &AnalysisPredictor::analysis_argument, py::return_value_policy::reference) .def("clone", [](AnalysisPredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](AnalysisPredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -1122,7 +1122,7 @@ void BindPaddleInferPredictor(py::module *m) { .def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](paddle_infer::Predictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 9ba115381a2c0..013cac0851154 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -126,7 +126,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index c97bba9be8f2f..2edb4c80d4897 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -126,11 +126,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -318,7 +318,7 @@ void BindPlace(pybind11::module &m) { // NOLINT cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (UNLIKELY(dev_id < 0)) { LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), device id must be 0 or " @@ -357,7 +357,7 @@ void BindPlace(pybind11::module &m) { // NOLINT std::exit(-1); #endif }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("get_device_id", [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) .def("_type", &PlaceIndex) @@ -372,10 +372,10 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -383,7 +383,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -540,7 +540,7
@@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 1a6b640b3a3cf..1a5a048a61383 100644 --- a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -250,7 +250,7 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -307,7 +307,7 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 60ade1f9875fd..3f5fffc1bc036 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -144,7 +144,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif @@ -776,7 +776,7 @@ PYBIND11_MODULE(libpaddle, m) { } }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -828,7 +828,7 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -2199,7 +2199,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2320,7 +2320,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2477,7 +2477,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 98ae45dd0134b..a9ce5910d4eb4 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -126,7 +126,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ba33fcd1d129f..8b4f4dcd62de1 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +325,7 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +362,7 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +457,7 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -790,7 +790,7 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1039,7 +1039,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ(py_arr.writeable(), true, diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 1ed3fac122826..593109d3e8e27 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -123,6 +123,9 @@ if(WITH_GPU) elseif(WITH_ROCM) hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) target_link_libraries(phi ${PHI_DEPS}) +elseif(WITH_MUSA) + musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) + target_link_libraries(phi ${PHI_DEPS}) elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index 7afe17ba8419d..65ddeceb7014c 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -97,7 +97,7 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * Get the current 
CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b626df6c6701c..4224aeae2b5c3 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -396,7 +396,7 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index 292bd8a7e47aa..b3badfdb94ff7 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -63,7 +63,7 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index f9316965be26b..eac1d34ada374 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -93,7 +93,7 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -135,7 +135,7 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -153,7 +153,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " << dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(yy): TransDataPlace should wait for computation of input. 
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index e8caf52530868..4a0b8426fa8d8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -359,7 +359,7 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index b8d25e4f22b10..3384b59158703 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/core/dense_tensor.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include #else @@ -30,7 +30,7 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 cudaPointerAttributes attr; diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index b19f20485227b..ebd613e4a8099 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -62,7 +62,7 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -81,7 +81,7 @@ class Event { int64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -137,7 +137,7 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) public: CudaEvent() { diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 1c916682cf7b1..beb0f88e3efcf 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,6 +16,9 @@ if(WITH_GPU OR WITH_ROCM) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() + if(WITH_MUSA) + list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) + endif() endif() if(WITH_XPU) @@ -43,6 +46,7 @@ list( if(WITH_GPU OR WITH_ROCM + OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index e295ac388d892..372edd66e50d9 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index 6ff90e05fed4a..efce5aac61a71 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -27,7 +27,7 @@ limitations under the License. 
*/ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -46,7 +46,7 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index eb2934d1b4842..27cdf09236d35 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,7 +78,7 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -88,12 +88,12 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 8debb4dc9c45e..64b89b83b42ed 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -48,7 +48,7 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 8508d5206558d..3a430132d9325 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,7 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 5c9c010d365e4..f10ec7019b7b6 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -1046,7 +1046,7 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUPinnedContext::GPUPinnedContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git 
a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index b4a3974378241..2127114de189c 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) #include @@ -276,7 +276,7 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using KPSContext = GPUContext; #endif @@ -287,7 +287,7 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Currently, GPUPinnedContext is only used to data copying. class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 0f79e2a645ab3..de4565cb6e7ce 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index f37afa3deeb74..44163d8048f2c 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 2353b42794ffd..428c5dcb96c6a 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index ebf57bd06eb19..2d1b7c1a98f27 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index a7a7ad03ad664..00aa244041bec 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,7 +16,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 77f403795b6b3..effab17059ac4 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,7 +17,7 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" @@ -80,4 +80,4 @@ DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc new file mode 100644 index 0000000000000..6579ce63f21f6 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +#include "musa_runtime_api.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags; +static std::vector<phi::gpuDeviceProp> g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { +  // The muDNN version query is not wired up yet; report 0 as a placeholder. +  return 0; +  //if (!dynload::HasCUDNN()) return -1; +  //size_t version_major, version_minor, version_patch; +  //PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( +  //    &version_major, &version_minor, &version_patch)); +  //return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { +  int driverVersion = 0; +  musaError_t status = musaDriverGetVersion(&driverVersion); + +  if (!(status == gpuSuccess && driverVersion != 0)) { +    // No GPU driver +    VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + return 100; + //int major, minor; + //auto major_error_code = musaDeviceGetAttribute( + // &major, musaDeviceAttributeComputeCapabilityMajor, id); + //auto minor_error_code = musaDeviceGetAttribute( + // &minor, musaDeviceAttributeComputeCapabilityMinor, id); + + //PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + //PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + //return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index 96b46fbc0d4ff..21df6c646cd3e 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,7 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index e9fe2aada1f35..7df79117dbae5 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,7 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 5540592d5013c..3d0bf86c2bca6 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,7 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index a4e003dd544ad..6df324c5ead11 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -37,7 +37,7 @@ #define PADDLE_ALIGN(x) __declspec(align(x)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -62,7 +62,7 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 86168d441ded2..094fc5681c04e 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -82,7 +82,7 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 x = 
reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index f9ef606049297..6dc6c1cba468d 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,7 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index f6a4afcea2f78..3baf7bbe35624 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -118,7 +118,7 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * @brief get the memory usage of current GPU device. * @@ -271,7 +271,7 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -372,7 +372,7 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total); #endif diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index fe15be4b2b909..0f009806e8c53 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -123,7 +123,7 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -169,7 +169,7 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 947c7fb45c5fc..9792f64c5c46d 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -57,7 +57,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { switch (backend) { case phi::Backend::CPU: return phi::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -66,7 +66,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -77,7 +77,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index b27770b081433..58f08a2a36b57 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" @@ -73,6 +78,9 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( + &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -92,6 +100,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -112,6 +122,14 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + musaError_t err = musaStreamQuery(raw_stream()); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 6b98fd0488595..d4ae30598551c 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -90,7 +90,7 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_types.h" #endif diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 0c581fb09919f..ebb54c8173917 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,7 @@ // limitations under the License. 
#include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +120,7 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * CUDA related related FLAG @@ -215,7 +215,7 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +322,7 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * NCCL related FLAG @@ -541,7 +541,7 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) /** @@ -785,7 +785,7 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +800,7 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -1127,7 +1127,7 @@ PHI_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, * Example: * Note: nccl blocking wait. 
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 4ed25af0814df..06ebdc1c0801c 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,7 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,7 +278,7 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; this->state_.thread_offset += increment_offset; diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 6511efa0152ee..5f9a40625fac9 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -120,7 +120,7 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -221,7 +221,7 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index f4e021f7269a7..984b28cf05316 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -60,7 +60,7 @@ struct KernelArgsParseFunctor { #if defined(PADDLE_WITH_MKLDNN) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) @@ -1401,7 +1401,7 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f4dc4636bdde3..d768ba85272aa 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -273,7 +273,7 @@ struct KernelImpl { 
/* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index 857bd546befcd..aba6a0f7bfca2 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,7 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +55,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 0e465982ba429..98ad70622b943 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -114,7 +114,7 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); #else diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index abe44d3e2550b..b4a1343423103 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -58,7 +58,7 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_MKLDNN dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -99,7 +99,7 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -386,7 +386,7 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -468,7 +468,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { 
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -522,7 +522,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -614,7 +614,7 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -714,7 +714,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -756,7 +756,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 2a554525024c8..648ef5c587126 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -60,12 +60,12 @@ template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class TypeInfoTraits; #endif diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 6318b17647cd6..34a8fca61fbbe 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +35,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 71bbfaa333a0a..88c273d6934ee 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -858,7 +858,7 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = phi::SizeOf(dtype); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int64_t numel = 0; for (size_t i = 0; i < input.size(); ++i) { const auto& dim = input[i]->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 25367be206139..e28210cfca7e4 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -117,7 +117,7 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc index f157c5e054bfb..9626621ae8657 100644 --- a/paddle/phi/kernels/activation_kernel.cc +++ b/paddle/phi/kernels/activation_kernel.cc @@ -32,7 +32,7 @@ using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(relu6, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index db30ec7389619..c44b6333154cc 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -135,7 +135,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 6e496a355302f..9f4b51281cd37 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 17c24fa905b5c..442290c3648e2 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8df5e9a543eb2..54449200ae4b2 100644 --- 
a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 42d137ba4f419..3ecef871d211d 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dc61e6a650efa..6b22ac7518179 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 38beafbfa51b9..982b6a396c2a8 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 999625cf3dfb4..c4bdf29e03949 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,7 +8,7 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 140eca890480f..2ea7a306f16fd 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -360,7 +360,7 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -543,7 +543,7 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 0cd07fdfd0e1a..d731c4f89b751 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +41,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +68,7 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h index 7f7b2be551a57..7a4ea0bb695bd 100644 --- a/paddle/phi/kernels/funcs/layer_norm_util.h +++ b/paddle/phi/kernels/funcs/layer_norm_util.h @@ -36,7 +36,7 @@ struct RowwiseMean2D { DenseTensor* vec); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class RowwiseMean2D { public: @@ -93,7 +93,7 @@ struct ColwiseSum2D { DenseTensor* vec); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class ColwiseSum2D { public: diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 10d18cc958ae3..7c2fd866e3b91 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -239,7 +239,7 @@ void set_constant(const phi::DeviceContext& context, return; } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // tensor->place().apply_visitor(func); phi::VisitPlace(tensor->place(), func); #elif defined(PADDLE_WITH_XPU) diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index b42714e80db2f..bce782049b8a8 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -25,7 +25,7 @@ limitations under the License. */ namespace phi { namespace funcs { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchTranspose(T* output, const T* input, diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index bf2409d2e502b..3d95ef45eaae6 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/macros.h" // import FLT_MAX -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_decls.h" #endif @@ -115,7 +115,7 @@ HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class Pool2dDirectCUDAFunctor { public: @@ -211,7 +211,7 @@ class MaxPool2dGradFunctor { DenseTensor* input_grad); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class Pool3dDirectCUDAFunctor { public: diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 96b7942cf2709..2976968d07b70 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -15,7 +15,7 @@ #pragma once // CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/phi/kernels/funcs/softmax.h b/paddle/phi/kernels/funcs/softmax.h index 80805eb6d76f6..1198b80a9e879 100644 --- a/paddle/phi/kernels/funcs/softmax.h +++ b/paddle/phi/kernels/funcs/softmax.h @@ -37,7 +37,7 @@ class SoftmaxGradFunctor { phi::DenseTensor* x_grad); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class SoftmaxCUDNNFunctor { public: diff --git a/paddle/phi/kernels/funcs/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h index de38e40d317e1..0e9dc896c3629 100644 --- a/paddle/phi/kernels/funcs/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/strided_memcpy.h @@ -56,7 +56,7 @@ inline void CopyWithContext(const Context& ctx, const Place& src_place, const void* src, size_t num) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream()); #else diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h index c1d60cbffee2f..418fa8bf55ce9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/funcs/aligned_vector.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #define WARP_SIZE 64 diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 1a07e5f0d4909..5c2d76be35992 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -611,7 +611,7 @@ void BatchNormKernel(const Context &ctx, } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP +#if 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(wangran16): wait for MIOpen to improve the performance of BN // mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index cc3cad38f46fb..428f105c9743a 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -15,7 +15,7 @@ #pragma once // CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index 7e01c1ae84391..c3c918c21cb35 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -15,7 +15,7 @@ #pragma once // CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #include diff --git a/paddle/phi/kernels/group_norm_kernel.h b/paddle/phi/kernels/group_norm_kernel.h index f3e39ddbeb328..ec134fa47eecd 100644 --- a/paddle/phi/kernels/group_norm_kernel.h +++ b/paddle/phi/kernels/group_norm_kernel.h @@ -33,7 +33,7 @@ void GroupNormKernel(const Context& dev_ctx, DenseTensor* mean, DenseTensor* variance); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class GroupNormDirectCUDAFunctor { public: diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h index 216d5e6100d6c..82b99b07a8927 100644 --- a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h +++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h @@ -64,7 +64,7 @@ void SegmentKernelLaunchHelper(const Context& dev_ctx, phi::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!cpu_place) { DenseTensor length; length.Resize(phi::make_ddim({1})); diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index 4b4bd6f5143dd..015c7a0764a2b 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -205,7 +205,7 @@ class WarpCTCFunctor { warpctc_version_ = phi::dynload::get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) options_.loc = CTC_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); diff --git a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h index f51041285aaee..f36ec9c007eda 100644 --- a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h +++ b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h @@ -139,7 +139,7 @@ class WarpRNNTFunctor { rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu = true; #else 
PADDLE_THROW(errors::PreconditionNotMet( @@ -208,7 +208,7 @@ class WarpRNNTFunctor { options_.batch_first = true; if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) options_.loc = RNNT_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc index 4b86f2dfe6950..f420a419f5c67 100644 --- a/paddle/phi/kernels/is_empty_kernel.cc +++ b/paddle/phi/kernels/is_empty_kernel.cc @@ -43,7 +43,7 @@ PD_REGISTER_KERNEL(is_empty, kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(is_empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index e88714c370be9..d72d051ba1bf8 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -87,7 +87,7 @@ void ElementwisePowKernel(const Context& dev_ctx, } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(maximum, KPS, diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h index 2fddcec2278c9..ee8a324e09b4f 100644 --- a/paddle/phi/kernels/layer_norm_kernel.h +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -30,7 +30,7 @@ void LayerNormKernel(const Context& ctx, DenseTensor* mean, DenseTensor* variance); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class LayerNormDirectCUDAFunctor { public: diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index 49d69a23fedd1..62a6cbc8ea840 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -117,7 +117,7 @@ void MemcpyKernel(const Context& dev_ctx, dev_ctx.HostAlloc(out, out->dtype()); Copy(dev_ctx, x, CPUPlace(), true, out); break; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case 1: /* CUDAPlace */ dev_ctx.Alloc(out, x.dtype()); Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); @@ -162,7 +162,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/npu_identity_kernel.cc b/paddle/phi/kernels/npu_identity_kernel.cc index 89a0c63c8a495..12d933af78733 100644 --- a/paddle/phi/kernels/npu_identity_kernel.cc +++ b/paddle/phi/kernels/npu_identity_kernel.cc @@ -62,7 +62,7 @@ PD_REGISTER_KERNEL(npu_identity, bool, phi::dtype::float16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(npu_identity, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/prod_kernel.cc b/paddle/phi/kernels/prod_kernel.cc index ea3faaebd9582..4e5546ca0df01 100644 --- 
a/paddle/phi/kernels/prod_kernel.cc +++ b/paddle/phi/kernels/prod_kernel.cc @@ -40,7 +40,7 @@ PD_REGISTER_KERNEL(prod_infer, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(prod_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc index 9e799f0d219fc..3b33c7f665e79 100644 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -40,7 +40,7 @@ void AllKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} #endif diff --git a/paddle/phi/kernels/reduce_amax_kernel.cc b/paddle/phi/kernels/reduce_amax_kernel.cc index 87e432c5c20a7..466d0497b2d8e 100644 --- a/paddle/phi/kernels/reduce_amax_kernel.cc +++ b/paddle/phi/kernels/reduce_amax_kernel.cc @@ -34,7 +34,7 @@ void AMaxKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( amax, CPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( amax, GPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {} #endif diff --git a/paddle/phi/kernels/reduce_amin_kernel.cc b/paddle/phi/kernels/reduce_amin_kernel.cc index a355da64230dc..a30ab4a91956d 100644 --- a/paddle/phi/kernels/reduce_amin_kernel.cc +++ b/paddle/phi/kernels/reduce_amin_kernel.cc @@ -34,7 +34,7 @@ void AMinKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( amin, CPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( amin, GPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {} #endif diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc index 9d162f8e02033..0b6f4028b62ac 100644 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ b/paddle/phi/kernels/reduce_any_kernel.cc @@ -33,7 +33,7 @@ void AnyKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} #endif diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 21b02412d31ca..fb8ea2f97bbea 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -41,7 +41,7 @@ PD_REGISTER_KERNEL(mean, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(mean, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc index de9688d4e60aa..59d192014da1d 100644 --- a/paddle/phi/kernels/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -53,7 +53,7 @@ PD_REGISTER_KERNEL(sum, 
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(sum, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/reverse_kernel.cc b/paddle/phi/kernels/reverse_kernel.cc index 771acacedf024..d8c8f5a966376 100644 --- a/paddle/phi/kernels/reverse_kernel.cc +++ b/paddle/phi/kernels/reverse_kernel.cc @@ -61,7 +61,7 @@ PD_REGISTER_KERNEL(reverse_array, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(reverse_array, GPU, diff --git a/paddle/phi/kernels/selected_rows/activation_kernel.cc b/paddle/phi/kernels/selected_rows/activation_kernel.cc index 4a27d0763a235..6bd55f701bb33 100644 --- a/paddle/phi/kernels/selected_rows/activation_kernel.cc +++ b/paddle/phi/kernels/selected_rows/activation_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(square_sr, GPU, diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc index 081d85e68c959..481f5f6fcf852 100644 --- a/paddle/phi/kernels/selected_rows/assign_kernel.cc +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -41,7 +41,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index dccbba6947a1b..0ea7fbe8857c4 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -84,7 +84,7 @@ PD_REGISTER_KERNEL(multiply_sr, complex64, complex128) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(multiply_raw_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index e04139448dddc..b593e6db3f936 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/selected_rows/full_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif #include "paddle/phi/common/bfloat16.h" @@ -54,7 +54,7 @@ PD_REGISTER_KERNEL(full_sr, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc index d68688a7e400a..e3489f50e2184 100644 --- a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/selected_rows/isfinite_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif #include "paddle/phi/core/kernel_registry.h" @@ -51,7 +51,7 @@ PD_REGISTER_KERNEL(isfinite_sr, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(isinf_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc index a5d2e66787316..7b6f7e9ceefa4 100644 --- a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc +++ b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc @@ -41,7 +41,7 @@ PD_REGISTER_KERNEL(merge_selected_rows, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(merge_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc index 38a0cb75101b7..f6f9d587c4022 100644 --- a/paddle/phi/kernels/selected_rows/scale_kernel.cc +++ b/paddle/phi/kernels/selected_rows/scale_kernel.cc @@ -54,7 +54,7 @@ PD_REGISTER_KERNEL(scale_sr, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(scale_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc index f44a6a8dfafc5..0a07bee7b6974 100644 --- a/paddle/phi/kernels/selected_rows/shape_kernel.cc +++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc @@ -52,7 +52,7 @@ PD_REGISTER_KERNEL(shape_sr, kernel->OutputAt(0).SetDataType(phi::DataType::INT32); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(shape_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/uniform_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_kernel.cc index 0af5d8788c71f..90bee1744e962 100644 --- a/paddle/phi/kernels/selected_rows/uniform_kernel.cc +++ b/paddle/phi/kernels/selected_rows/uniform_kernel.cc @@ -77,7 +77,7 @@ 
PD_REGISTER_KERNEL(uniform_sr, double, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(uniform_raw_sr, GPU, diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc index c4190a5f59b62..e7556d1401954 100644 --- a/paddle/phi/kernels/shape_kernel.cc +++ b/paddle/phi/kernels/shape_kernel.cc @@ -51,7 +51,7 @@ PD_REGISTER_KERNEL(shape, kernel->OutputAt(0).SetDataType(phi::DataType::INT32); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(shape, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index 49a377ca70f67..44ccdd3bda634 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -82,7 +82,7 @@ PD_REGISTER_KERNEL(empty_like_csr, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty_like_coo, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 064867610d719..8e9ed654760f3 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -81,7 +81,7 @@ PD_REGISTER_KERNEL(sparse_coo_tensor_grad, kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(values_coo_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/squeeze_grad_kernel.cc b/paddle/phi/kernels/squeeze_grad_kernel.cc index 473acf9d7a1d1..3eab4daf5740a 100644 --- a/paddle/phi/kernels/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/squeeze_grad_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(squeeze_grad, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(squeeze_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/squeeze_kernel.cc b/paddle/phi/kernels/squeeze_kernel.cc index d495b040921b5..933540cd787e4 100644 --- a/paddle/phi/kernels/squeeze_kernel.cc +++ b/paddle/phi/kernels/squeeze_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(squeeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(squeeze_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/strided_slice_grad_kernel.cc b/paddle/phi/kernels/strided_slice_grad_kernel.cc index 7582f751bf16a..dd5bd42a3f48a 100644 --- a/paddle/phi/kernels/strided_slice_grad_kernel.cc +++ b/paddle/phi/kernels/strided_slice_grad_kernel.cc @@ -55,7 +55,7 @@ PD_REGISTER_KERNEL(strided_slice_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(strided_slice_grad, GPU, ALL_LAYOUT, diff 
--git a/paddle/phi/kernels/strided_slice_kernel.cc b/paddle/phi/kernels/strided_slice_kernel.cc index 68377dbe8468e..79e43de25e9a8 100644 --- a/paddle/phi/kernels/strided_slice_kernel.cc +++ b/paddle/phi/kernels/strided_slice_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(strided_slice, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(strided_slice, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 36cad02618424..a6c2aba97b5e8 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -23,7 +23,7 @@ limitations under the License. */ namespace phi { namespace strings { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) __global__ void SerializeStringsData(const phi::dtype::pstring* src_str, uint8_t* strings_data, int32_t* strings_offset, @@ -146,7 +146,7 @@ void DeserializeOnCPU(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SerializeOnGPU(const phi::GPUContext& dev_ctx, const StringTensor& src, DenseTensor* dst) { diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.cc b/paddle/phi/kernels/strings/strings_empty_kernel.cc index 22a43ceaff1c1..60a75584587d3 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.cc +++ b/paddle/phi/kernels/strings/strings_empty_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, phi::strings::EmptyLikeKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/strings/unicode.cc b/paddle/phi/kernels/strings/unicode.cc index 9f636809de876..75e48f1ce982e 100644 --- a/paddle/phi/kernels/strings/unicode.cc +++ b/paddle/phi/kernels/strings/unicode.cc @@ -46,7 +46,7 @@ const uint16_t* GetCharcasesMap() { return reinterpret_cast(utils_map[0]); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const uint8_t* GetGPUUniflagMap() { if (utils_map[3] == nullptr) { diff --git a/paddle/phi/kernels/strings/unicode.h b/paddle/phi/kernels/strings/unicode.h index 45e41b72d086c..89ec9efa15189 100644 --- a/paddle/phi/kernels/strings/unicode.h +++ b/paddle/phi/kernels/strings/unicode.h @@ -188,7 +188,7 @@ HOSTDEVICE inline void GetUTF8Str(const uint32_t* unicode_str, const uint8_t* GetUniFlagMap(); const uint16_t* GetCharcasesMap(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const uint8_t* GetGPUUniflagMap(); const uint16_t* GetGPUCharcasesMap(); diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 84b978436e163..5ee69e5964918 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -71,7 +71,7 @@ void TransferLayoutGeneral(const Context& dev_ctx, out->Resize(phi::make_ddim(dst_dim)); dev_ctx.Alloc(out, x.dtype()); 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // In GPU fp16 model, we will insert many transfer_layout ops in // conv2d_fusion_layout_transfer_pass, so we optimize this kernel on GPU if (std::is_same::value) { @@ -221,7 +221,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout, CPU, ALL_LAYOUT, phi::TransferLayoutKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/unsqueeze_grad_kernel.cc index 3c119db2c73d6..e294c3a983769 100644 --- a/paddle/phi/kernels/unsqueeze_grad_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_grad_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(unsqueeze_grad, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(unsqueeze_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/unsqueeze_kernel.cc b/paddle/phi/kernels/unsqueeze_kernel.cc index c08c31da4ef0c..6e03176857e4c 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_kernel.cc @@ -80,7 +80,7 @@ PD_REGISTER_KERNEL(unsqueeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(unsqueeze_infer, GPU, ALL_LAYOUT, diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 7295e86182734..c3ca58e0e4a94 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) DECLARE_bool(enable_gpu_memory_usage_log); #endif @@ -84,7 +84,7 @@ int main(int argc, char** argv) { VLOG(1) << "gtest undefok_string:" << undefok_string; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (strstr(undefok_str, "enable_gpu_memory_usage_log")) { VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true"; FLAGS_enable_gpu_memory_usage_log = true; From a705c264bf5c5b213e3def54afd047df29f6e889 Mon Sep 17 00:00:00 2001 From: HanHaowen Date: Tue, 25 Jul 2023 16:20:25 +0800 Subject: [PATCH 02/55] add musa macro in phi folder except phi/kernels folder --- paddle/phi/api/include/tensor.h | 5 + paddle/phi/api/profiler/event.h | 23 ++++ paddle/phi/backends/device_code.cc | 125 ++++++++++++++++++ paddle/phi/backends/device_code.h | 9 ++ paddle/phi/backends/dynload/dynamic_loader.cc | 5 + paddle/phi/backends/gpu/gpu_context.cc | 59 +++++++++ paddle/phi/backends/gpu/gpu_decls.h | 4 + paddle/phi/backends/gpu/gpu_device_function.h | 2 + paddle/phi/backends/gpu/gpu_dnn.h | 3 + paddle/phi/backends/gpu/gpu_helper.h | 2 + paddle/phi/backends/gpu/gpu_primitives.h | 3 + paddle/phi/backends/gpu/gpu_resources.cc | 88 +++++++++++- paddle/phi/backends/gpu/gpu_types.h | 10 +- .../backends/gpu/rocm/rocm_device_function.h | 2 + paddle/phi/common/bfloat16.h | 17 +++ paddle/phi/common/complex.h | 14 ++ paddle/phi/common/float16.h | 4 + paddle/phi/core/cuda_stream.h | 9 ++ paddle/phi/core/enforce.h | 24 ++++ paddle/phi/core/string_tensor.cc | 2 + 20 files changed, 408 insertions(+), 2 deletions(-) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 4224aeae2b5c3..b2c687a1f448d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index ebd613e4a8099..3a789cad101f4 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,6 +27,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" @@ -143,6 +146,8 @@ class CudaEvent { CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -152,6 +157,8 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -161,6 +168,8 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -169,6 +178,8 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -183,6 +194,14 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event_); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -199,6 +218,8 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -208,6 +229,8 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; +#elif defined(PADDLE_WITH_MUSA) + unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index 27cdf09236d35..529e42fc4c95b 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -101,6 +101,13 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); +#elif defined(PADDLE_WITH_MUSA) +static bool CheckCUDADriverResult(MUresult result, + std::string caller, + std::string kernel_name = "") { + if (result != MUSA_SUCCESS) { + const char* error = nullptr; + dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -130,6 +137,8 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#elif defined(PADDLE_WITH_MUSA) + nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -140,6 +149,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + if 
(driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -153,6 +165,8 @@ << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + if (nvrtc_result != NVRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -163,6 +177,9 @@ #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { +#elif defined(PADDLE_WITH_MUSA) + if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -202,6 +219,8 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; +#elif defined(PADDLE_WITH_MUSA) + cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -229,6 +248,8 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; +#elif defined(PADDLE_WITH_MUSA) + kernel_ = kernel; #else kernel_ = kernel; #endif @@ -318,6 +339,86 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } +#elif defined(PADDLE_WITH_MUSA) + nvrtcProgram program; + if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "nvrtcCreateProgram")) { + return false; + } + + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::string compute_flag = + "--gpu-architecture=compute_" + std::to_string(compute_capability); + std::vector options = {"--std=c++11", compute_flag.c_str()}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + nvrtcResult compile_result = + dynload::nvrtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == NVRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::nvrtcGetProgramLogSize(program, &log_size), + "nvrtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if (!CheckNVRTCResult(dynload::nvrtcGetProgramLog(program, log.data()), + "nvrtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of CUDA code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program + size_t ptx_size; + if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size), + "nvrtcGetPTXSize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()), + "nvrtcGetPTX")) { + return false; + } + + if (!CheckNVRTCResult(dynload::nvrtcDestroyProgram(&program), + "nvrtcDestroyProgram")) {
+ return false; + } + + if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), + "muModuleLoadData", + name_)) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::muModuleGetFunction(&function_, module_, name_.c_str()), + "muModuleGetFunction", + name_)) { + return false; + } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -436,6 +537,22 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_EQ( + dynload::muLaunchKernel(function_, + num_blocks, + 1, + 1, // grid dim + num_threads_, + 1, + 1, // block dim + 0, // shared memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + MUSA_SUCCESS, + errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -464,6 +581,14 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } +#elif defined(PADDLE_WITH_MUSA) +bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { + if (result != NVRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::nvrtcGetErrorString(result); + return false; + } #else bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 64b89b83b42ed..63d221ea8c89a 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,6 +26,10 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musa_driver.h" +#include "paddle/phi/backends/dynload/nvrtc.h" +#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" @@ -68,6 +72,8 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); +#elif defined(PADDLE_WITH_MUSA) + bool CheckNVRTCResult(cudartcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -82,6 +88,9 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; +#elif defined(PADDLE_WITH_MUSA) + MUmodule module_; + MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 354ff5b7dc855..fd6d3ef9e0097 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -506,6 +506,11 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); +#elif defined(PADDLE_WITH_MUSA) + std::string warning_msg( + "You may need to install 'mccl' from MUSA official website: " + "https://rocmdocs.amd.com/en/latest/Installation_Guide/" + "Installation-Guide.html before install PaddlePaddle."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f10ec7019b7b6..9291f3d00d8f3 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -43,6 +43,17 @@ limitations under the License. 
*/ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musolver.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" @@ -119,6 +130,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -143,6 +157,16 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif + +#ifdef PADDLE_WITH_MUSA +#if MUSA_VERSION >= 10000 + static void MUDART_CB StreamCallbackFunc(void* user_data) +#else + static void MUDART_CB + StreamCallbackFunc(musaStream_t stream, musaError_t status, void* user_data) +#endif +#endif + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void* user_data) @@ -170,6 +194,8 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); +#elif defined(PADDLE_WITH_MUSA) + auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -178,6 +204,8 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -464,6 +492,11 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -529,6 +562,18 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) + +#elif defined(PADDLE_WITH_MUSA) + musaError_t e_sync = musaSuccess; +#if !defined(_WIN32) + e_sync = musaStreamSynchronize(stream()); +#else + while (e_sync = musaStreamQuery(stream())) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) + #else // PADDLE_WITH_HIP cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) @@ -547,6 +592,8 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif @@ -678,6 +725,8 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); +#elif 
defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -708,6 +757,16 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif +#endif + +#ifdef PADDLE_WITH_MUSA +#if MUSA_VERSION >= 10000 + PADDLE_ENFORCE_GPU_SUCCESS( + musaLaunchHostFunc(stream(), internal::StreamCallbackFunc, func)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); +#endif #endif } diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 4a6b9d2fd87f1..93dba9764478a 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -23,6 +23,10 @@ namespace phi { #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; +#elif defined(PADDLE_WITH_MUSA) + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index de4565cb6e7ce..5c0c475b140ff 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -17,6 +17,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index 44163d8048f2c..30cf3fae80519 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -19,6 +19,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" +#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 428c5dcb96c6a..8afa826408cb7 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -17,6 +17,8 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index a77527c081650..b7c9f9c4688dc 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,6 +16,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index a447df94cb4dc..b60d0cccd3dc5 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -33,6 +33,19 @@ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA + + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mublasLt.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musolver.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" #endif @@ -144,6 +157,44 @@ void InitGpuProperties(Place place, << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } +#elif defined(PADDLE_WITH_MUSA) + size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." + << (mudnn_dso_ver % 1000) / 100 << "."; + + // Check MUSA/MUDNN version compatiblity + auto local_musa_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_musa_version = + (MUSA_VERSION / 1000) * 10 + (MUSA_VERSION % 100) / 10; +#if defined(__linux__) + PADDLE_ENFORCE_EQ( + (local_musa_version / 10 < compile_musa_version / 10) && + (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with MUDA%d/muDNN%d," + "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " + "which will cause serious incompatible bug. " + "Please recompile or reinstall Paddle with compatible MUSA/muDNN " + "version.", + compile_musa_version / 10, + MUDNN_VERSION / 1000, + local_musa_version / 10, + mudnn_dso_ver / 1000)); +#endif + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) @@ -189,6 +240,9 @@ void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -199,6 +253,8 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -210,7 +266,11 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); -#else // PADDLE_WITH_CUDA +#elif defined(PADDLE_WITH_MUSA) + PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mublasCreate(blas_handle)); + PADDLE_RETRY_MUSA_SUCCESS( + phi::dynload::mublasSetStream(*blas_handle, stream)); +#else // PADDLE_WITH_MUSA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( phi::dynload::cublasSetStream(*blas_handle, stream)); @@ -223,6 +283,11 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::mublasDestroy(handle); + handle = nullptr; + } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -268,6 +333,22 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); +#elif defined(PADDLE_WITH_MUSA) + auto local_cudnn_version = phi::dynload::mudnnGetVersion() / 100; + auto compile_mudnn_version = MUDNN_VERSION / 100; + if (local_mudnn_version < static_cast(compile_mudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with MUDNN " + << compile_mudnn_version / 10 << "." << compile_mudnn_version % 10 + << ", but MUDNN version in your machine is " + << local_mudnn_version / 10 << "." << local_mudnn_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MUDNN " + "version."; + } + PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnCreate(handle)); + PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnSetStream(*handle, stream)); #else auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; @@ -296,6 +377,11 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(handle)); + handle = nullptr; + } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index effab17059ac4..36e094f4a0814 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -22,6 +22,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -32,7 +35,9 @@ namespace phi { #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; - +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ @@ -56,6 +61,9 @@ DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, #ifdef PADDLE_WITH_HIP #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; diff --git a/paddle/phi/backends/gpu/rocm/rocm_device_function.h b/paddle/phi/backends/gpu/rocm/rocm_device_function.h index 6f5d684075f0f..0785ba2dd1cdb 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_device_function.h +++ b/paddle/phi/backends/gpu/rocm/rocm_device_function.h @@ -132,6 +132,8 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. 
#ifdef PADDLE_WITH_HIP const int warpSize = 64; +#elif defined(PADDLE_WITH_MUSA) + const int warpSize = 32; #else const int warpSize = 32; #endif diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 7ea9b0cbb6477..4cc21a14faac8 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -61,6 +61,13 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#elif defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_MUSA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); +#else + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif #else #if defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); @@ -154,6 +161,16 @@ struct PADDLE_ALIGN(2) bfloat16 { uint16_t* temp_ptr = reinterpret_cast(&temp); res = *temp_ptr; return res; +#elif defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_MUSA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else + float val = 0.f; + uint16_t temp = x; + std::memcpy( + reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); + return val; +#endif #else #ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 6df324c5ead11..43e513146ba0a 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,6 +26,11 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include // NOLINT @@ -83,6 +88,15 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } + +#elif defined(PADDLE_WITH_MUSA) + HOSTDEVICE inline explicit operator muFloatComplex() const { + return make_muFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator muDoubleComplex() const { + return make_muDoubleComplex(real, imag); + } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 094fc5681c04e..572f460197f08 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,6 +37,10 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 58f08a2a36b57..87ab5e23818fb 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_CUDA +#include +using gpuStream_t = cudaStream_t; +#endif + #ifdef PADDLE_WITH_MUSA #include using gpuStream_t = musaStream_t; @@ -152,6 +157,8 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -164,6 +171,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 
d4ae30598551c..cda5a3a49c528 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -35,6 +35,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include @@ -75,6 +85,20 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/murand.h" +#include "paddle/phi/backends/dynload/musolver.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include + +#include "paddle/phi/backends/dynload/mccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_MUSA + + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 98ad70622b943..1d95e16e2d9cc 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -117,6 +117,8 @@ void StringTensor::init_holder() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif From be2dfd9841a22212a5fe0d6e717aa155dc7717df Mon Sep 17 00:00:00 2001 From: Xiaokang Shang Date: Tue, 25 Jul 2023 06:57:57 +0000 Subject: [PATCH 03/55] add musa device context and allocator --- .../allocation/naive_best_fit_allocator.cc | 2 +- paddle/fluid/platform/collective_helper.cc | 2 + paddle/fluid/platform/device/gpu/gpu_info.cc | 14 +- paddle/fluid/platform/device/gpu/gpu_types.h | 39 +- paddle/phi/backends/gpu/gpu_context.cc | 4 + paddle/phi/backends/gpu/musa/miopen_desc.h | 264 ++++++++ paddle/phi/backends/gpu/musa/miopen_helper.h | 595 ++++++++++++++++++ .../backends/gpu/musa/rocm_device_function.h | 165 +++++ paddle/phi/backends/gpu/musa/rocm_helper.h | 74 +++ paddle/phi/backends/gpu/musa/rocm_info.cc | 334 ++++++++++ 10 files changed, 1473 insertions(+), 20 deletions(-) create mode 100644 paddle/phi/backends/gpu/musa/miopen_desc.h create mode 100644 paddle/phi/backends/gpu/musa/miopen_helper.h create mode 100644 paddle/phi/backends/gpu/musa/rocm_device_function.h create mode 100644 paddle/phi/backends/gpu/musa/rocm_helper.h create mode 100644 paddle/phi/backends/gpu/musa/rocm_info.cc diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 27a6e3857f224..a7af040f86c5f 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { template <> size_t Used(const platform::CUDAPlace &place) { -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index b133a57d523ac..a6c2b9d61dd2b 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -174,6 +174,8 @@ void 
NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 94c85105115d6..73fe0ca05ba73 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#elif defined(PADDLE_WITH_MUSA) +//TODO(Xiaokang Shang) #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -217,7 +219,11 @@ class RecordedGpuMallocHelper { result = hipMalloc(ptr, size); } #elif defined(PADDLE_WITH_MUSA) - result = musaMalloc(ptr, size); + if (UNLIKELY(malloc_managed_memory)) { + result = musaMallocManaged(ptr, size); + } else { + result = musaMalloc(ptr, size); + } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -264,9 +270,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { -#elif define(PADDLE_WITH_MUSA) +#elif defined(PADDLE_WITH_MUSA) auto err = musaFree(ptr); - if (err != musaErrorMusaUnloading) { + if (err != musaErrorInvalidValue) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -314,7 +320,7 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); -#elif define(PADDLE_WITH_MUSA) +#elif defined(PADDLE_WITH_MUSA) auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index 83497a2507005..b3d4c7071c216 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -23,6 +23,9 @@ #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include +//TODO(Xiaokang Shang) #else #include @@ -34,24 +37,30 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; + +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_T); 
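// Illustrative expansion (a sketch, not part of this change): with the fourth
// macro argument in place, a PADDLE_WITH_MUSA build resolves the aliases
// above to the MUSA runtime types, e.g.
//
//   using gpuStream_t = musaStream_t;
//   using gpuError_t  = musaError_t;
//   using gpuEvent_t  = musaEvent_t;  // the declaration above spells this
//                                     // musaEvent_T; the runtime type is
//                                     // presumably lower-case musaEvent_t
//
// so backend-neutral code declares handles once through the gpu* aliases and
// only branches per backend at the API-call sites, e.g. (musaEventCreate is
// assumed to mirror cudaEventCreate):
//
//   gpuEvent_t ev;                          // musaEvent_t in a MUSA build
//   gpuError_t err = musaEventCreate(&ev);  // compared against gpuSuccess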
+DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp_t); -DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +// TODO(Xiaokang Shang): confirm mudnn type +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, - miopenActivationDescriptor); + miopenActivationDescriptor, + mudnnActivationStruct); DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); @@ -80,9 +89,9 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. @@ -93,21 +102,21 @@ using CUDAGraphID = unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; -#elif PADDLE_WITH_MUSA +#elif defined(PADDLE_WITH_MUSA) #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, hipErrorOutOfMemory, - musaErrorMemoryAllocation); + musaErrorOutOfMemory); DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 9291f3d00d8f3..e954c7db337aa 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -62,6 +62,10 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP +#ifdef PADDLE_WITH_MUSA + +#endif + // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/phi/backends/gpu/musa/miopen_desc.h b/paddle/phi/backends/gpu/musa/miopen_desc.h new file mode 100644 index 0000000000000..ae0e274ca650e --- /dev/null +++ b/paddle/phi/backends/gpu/musa/miopen_desc.h @@ -0,0 +1,264 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace backends { +namespace gpu { + +inline std::vector TransformDimOrder(const std::vector& dims) { + std::vector transformed_dims(dims.begin(), dims.end()); + int H, W, D, C; + if (dims.size() == 4) { + H = dims[1]; + W = dims[2]; + C = dims[3]; + transformed_dims[1] = C; + transformed_dims[2] = H; + transformed_dims[3] = W; + } else { + D = dims[1]; + H = dims[2]; + W = dims[3]; + C = dims[4]; + transformed_dims[1] = C; + transformed_dims[2] = D; + transformed_dims[3] = H; + transformed_dims[4] = W; + } + return transformed_dims; +} + +inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { + miopenDataType_t type = miopenFloat; + switch (t) { + case phi::DataType::FLOAT16: + type = miopenHalf; + break; + case phi::DataType::FLOAT32: + type = miopenFloat; + break; + default: + break; + } + return type; +} + +class ActivationDescriptor { + public: + using T = miopenActivationDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyActivationDescriptor(t)); + t = nullptr; + } + } + }; + ActivationDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + template + void set(miopenActivationMode_t mode, const T& coef) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( + desc_.get(), mode, static_cast(coef), 0.0, 0.0)); + } + + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + private: + std::unique_ptr desc_; +}; + +class TensorDescriptor { + public: + using T = miopenTensorDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(t)); + t = nullptr; + } + } + }; + TensorDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(const phi::DenseTensor& tensor, const int groups = 1) { + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), + ToCudnnDataType(tensor.dtype()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + void set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { + const int groups = 1; + PADDLE_ENFORCE_EQ( + format, + MIOPEN_TENSOR_NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), 
dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), + ToCudnnDataType(tensor.dtype()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + private: + std::unique_ptr desc_; +}; + +class FilterDescriptor { + public: + using T = miopenTensorDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(t)); + t = nullptr; + } + } + }; + FilterDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(const phi::DenseTensor& tensor, + const miopenTensorFormat_t format, + const int groups = 1) { + PADDLE_ENFORCE_EQ( + format, + MIOPEN_TENSOR_NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), + ToCudnnDataType(tensor.dtype()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + private: + std::unique_ptr desc_; +}; + +class ConvolutionDescriptor { + public: + using T = miopenConvolutionDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyConvolutionDescriptor(t)); + t = nullptr; + } + } + }; + ConvolutionDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(miopenDataType_t dtype, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations, + bool allow_tf32, + const int groups = 1) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( + (miopenConvolutionDescriptor_t)desc_.get(), + static_cast(pads.size()), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + miopenConvolution)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( + (miopenConvolutionDescriptor_t)desc_.get(), groups)); + } + + private: + std::unique_ptr desc_; +}; + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/miopen_helper.h b/paddle/phi/backends/gpu/musa/miopen_helper.h new file mode 100644 index 0000000000000..095f32ba460d0 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/miopen_helper.h @@ -0,0 +1,595 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/phi/backends/dynload/miopen.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/macros.h" + +// MIOPEN do not have epslion definition +#define CUDNN_BN_MIN_EPSILON 1e-05 + +DECLARE_bool(cudnn_deterministic); + +namespace phi { +namespace backends { +namespace gpu { + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "miopenStatusSuccess"; + case miopenStatusNotInitialized: + return "miopenStatusNotInitialized"; + case miopenStatusAllocFailed: + return "miopenStatusAllocFailed"; + case miopenStatusBadParm: + return "miopenStatusBadParm"; + case miopenStatusInternalError: + return "miopenStatusInternalError"; + case miopenStatusInvalidValue: + return "miopenStatusInvalidValue"; + case miopenStatusUnknownError: + return "miopenStatusUnknownError"; + case miopenStatusNotImplemented: + return "miopenStatusNotImplemented"; + default: + return "Unknown miopen error number"; + } +} + +// no use, but will have compiling error if not defined +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + kNDHWC, // add, liyamei + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kMaximumDeterministic, + kAverageExclusive, + kAverageInclusive, +}; + +enum class ActivationMode { + kNone, // activation identity + kSigmoid, + kRelu, + kRelu6, + kReluX, + kTanh, + kBandPass, +}; + +inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return miopenPoolingMax; + case PoolingMode::kAverageExclusive: + return miopenPoolingAverage; + case PoolingMode::kAverageInclusive: + return miopenPoolingAverageInclusive; + case PoolingMode::kMaximum: + return miopenPoolingMax; + default: + PADDLE_THROW( + phi::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); + } +} + +inline ActivationMode StringToActivationMode(const std::string& str) { + if (str == "identity") { + return ActivationMode::kNone; + } else if (str == "sigmoid") { + return ActivationMode::kSigmoid; + } else if (str == "relu") { + return ActivationMode::kRelu; + } else if (str == "relu6") { + return ActivationMode::kRelu6; + } else if (str == "relux") { + return ActivationMode::kReluX; + } else if (str == "tanh") { + return ActivationMode::kTanh; + } else if (str == "bandpass") { + return ActivationMode::kBandPass; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unknown MIOPEN activation string: %s.", str)); + } +} + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenHalf; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = 
float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenBFloat16; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenFloat; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { + switch (order) { + case DataLayout::kNHWC: + return MIOPEN_TENSOR_NHWC; + case DataLayout::kNCHW: + return MIOPEN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return MIOPEN_TENSOR_NCHW; + case DataLayout::kNDHWC: + return MIOPEN_TENSOR_NHWC; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "MIOPEN has no equivalent dataLayout for input order.")); + } + return MIOPEN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(desc_)); + } + + inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, + const miopenDataType_t type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Here, Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + + // MIOPEN ONLY support data layout of NCHW + PADDLE_ENFORCE_EQ( + format, + MIOPEN_TENSOR_NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); + if (dims.size() == 4) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + type, + dims_with_group.size(), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } else if (dims.size() == 5) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + type, + dims_with_group.size(), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + return descriptor( + GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); + } + + inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, + const std::vector& dim, + const std::vector& stride) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + miopen_type, + dim.size(), + const_cast(dim.data()), + const_cast(stride.data()))); + return desc_; + } + + template + inline 
miopenTensorDescriptor_t descriptor(const std::vector& dim, + const std::vector& stride) { + return descriptor(CudnnDataType::type, dim, stride); + } + + inline miopenTensorDescriptor_t desc() { return desc_; } + + private: + miopenTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedDropoutDescriptor { + public: + ScopedDropoutDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateDropoutDescriptor(&desc_)); + } + ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyDropoutDescriptor(desc_)); + } + + inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, + const phi::Place& place, + bool initialized, + float dropout_prob_, + phi::DenseTensor* dropout_state_, + int seed, + size_t state_size) { + if (dropout_state_ == nullptr) { // for no dropout or test + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSetDropoutDescriptor(desc_, + handle, + 0 /* dropout */, + nullptr, + 0 /* state_size */, + 0 /* seed */, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); + return desc_; + } + auto* dropout_state_data = dropout_state_->data(); + if (!initialized) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSetDropoutDescriptor(desc_, + handle, + dropout_prob_, + dropout_state_data, + state_size, + seed, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); + } else { + auto dropout_state_dims = dropout_state_->dims(); + state_size = dropout_state_dims[0]; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( + desc_, + handle, + dropout_prob_, + dropout_state_data, + state_size, + 0, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); + } + return desc_; + } + inline miopenDropoutDescriptor_t desc() { return desc_; } + + private: + miopenDropoutDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); +}; + +class ScopedRNNDescriptor { + public: + ScopedRNNDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); + } + ~ScopedRNNDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); + } + + inline miopenRNNDescriptor_t desc() { return desc_; } + + private: + miopenRNNDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(desc_)); + } + + inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, + const miopenDataType_t type, + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW(MCDHW), where M is the number of + // output image channels, C is the number of input image channels, + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
+ } + std::vector stride_dim(kernel_with_group.size()); + stride_dim.push_back(1); + for (int k = kernel_with_group.size() - 2; k >= 0; k--) { + stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + type, + kernel_with_group.size(), + const_cast(kernel_with_group.data()), + const_cast(stride_dim.data()))); + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel, + const int groups = 1) { + return descriptor( + GetCudnnTensorFormat(order), CudnnDataType::type, kernel, groups); + } + + inline miopenTensorDescriptor_t desc() { return desc_; } + + private: + miopenTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); + } + + inline miopenConvolutionDescriptor_t descriptor( + miopenDataType_t type, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), + strides.size(), + phi::errors::InvalidArgument( + "The size of pads and strides should be equal. But " + "received size of pads is %d, size of strides is %d.", + pads.size(), + strides.size())); + PADDLE_ENFORCE_EQ( + pads.size(), + dilations.size(), + phi::errors::InvalidArgument( + "The size of pads and dilations should be equal. But received size " + "of pads is %d, size of dilations is %d.", + pads.size(), + dilations.size())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( + desc_, + pads.size(), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + miopenConvolution)); + return desc_; + } + + template + inline miopenConvolutionDescriptor_t descriptor( + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + miopenConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyPoolingDescriptor(desc_)); + } + + inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), + pads.size(), + phi::errors::InvalidArgument( + "The size of kernel and pads should be equal. But " + "received size of kernel is %d, size of pads is %d.", + kernel.size(), + pads.size())); + PADDLE_ENFORCE_EQ( + kernel.size(), + strides.size(), + phi::errors::InvalidArgument( + "The size of kernel and strides should be equal. 
But " + "received size of kernel is %d, size of strides is %d.", + kernel.size(), + strides.size())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( + desc_, + GetPoolingMode(mode), + kernel.size(), + const_cast(kernel.data()), + const_cast(pads.data()), + const_cast(strides.data()))); + return desc_; + } + + private: + miopenPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +class ScopedActivationDescriptor { + public: + ScopedActivationDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateActivationDescriptor(&desc_)); + } + ~ScopedActivationDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyActivationDescriptor(desc_)); + } + + template + inline miopenActivationDescriptor_t descriptor( + const std::string& act, double value_max = static_cast(0.)) { + double relu_ceiling = 0.0; + ActivationMode activation_mode = StringToActivationMode(act); + miopenActivationMode_t mode; + switch (activation_mode) { + case ActivationMode::kNone: + mode = miopenActivationPASTHRU; + break; + case ActivationMode::kRelu6: + relu_ceiling = 6.0; + mode = miopenActivationCLIPPEDRELU; + break; + case ActivationMode::kReluX: + relu_ceiling = value_max; + mode = miopenActivationCLIPPEDRELU; + break; + case ActivationMode::kRelu: + mode = miopenActivationRELU; + break; + case ActivationMode::kSigmoid: + mode = miopenActivationLOGISTIC; + break; + case ActivationMode::kTanh: + mode = miopenActivationTANH; + break; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "Unrecognized MIOPEN activation mode: %d.", + static_cast(activation_mode))); + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( + desc_, mode, relu_ceiling, 0.0, 0.0)); + return desc_; + } + + private: + miopenActivationDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); +}; + +class ScopedCTCLossDescriptor { + public: + ScopedCTCLossDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); + } + ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); + } + + template + inline miopenCTCLossDescriptor_t descriptor() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( + desc_, CudnnDataType::type, 0, false)); + return desc_; + } + + private: + miopenCTCLossDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); +}; + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_device_function.h b/paddle/phi/backends/gpu/musa/rocm_device_function.h new file mode 100644 index 0000000000000..6f5d684075f0f --- /dev/null +++ b/paddle/phi/backends/gpu/musa/rocm_device_function.h @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. 
+#define PADDLE_CUDA_FP16 +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down(val, delta, width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor(val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down( + static_cast(val), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { + return phi::dtype::bfloat16(__shfl_down( + static_cast(val), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( + unsigned mask, phi::dtype::float16 val, int width) { + return phi::dtype::float16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { + return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl(val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. 
+ // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. +#ifdef PADDLE_WITH_HIP + const int warpSize = 64; +#else + const int warpSize = 32; +#endif + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_helper.h b/paddle/phi/backends/gpu/musa/rocm_helper.h new file mode 100644 index 0000000000000..07fdde5a2f417 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/rocm_helper.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * + */ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_info.cc b/paddle/phi/backends/gpu/musa/rocm_info.cc new file mode 100644 index 0000000000000..32c7c329253b1 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/rocm_info.cc @@ -0,0 +1,334 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + size_t version_major, version_minor, version_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &version_major, &version_minor, &version_patch)); + return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + musaError_t status = musaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. 
No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int major, minor; + auto major_error_code = musaDeviceGetAttribute( + &major, musaDeviceAttributeComputeCapabilityMajor, id); + auto minor_error_code = musaDeviceGetAttribute( + &minor, musaDeviceAttributeComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + // TODO(qili93): Hygon DTK (21.04 and 22.04) not support + // musaDeviceAttributeManagedMemory, temporary disable by default, to be + // verified in next DTK release + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); +#ifdef __linux__ + return IsGPUManagedMemorySupported(dev_id) && + GetGPUComputeCapability(dev_id) >= 60; +#else + return false; +#endif +} + +} // namespace gpu +} // namespace backends +} // namespace phi From c34f22d0ee4b7774c6f343f723c910583d4bf88f Mon Sep 17 00:00:00 2001 From: Xiaokang Shang Date: Tue, 25 Jul 2023 17:13:23 +0800 Subject: [PATCH 04/55] remove paddle/phi/backends/gpu/musa files --- paddle/phi/backends/gpu/musa/miopen_desc.h | 264 -------- paddle/phi/backends/gpu/musa/miopen_helper.h | 595 ------------------ paddle/phi/backends/gpu/musa/musa_info.cc | 32 +- .../backends/gpu/musa/rocm_device_function.h | 165 ----- paddle/phi/backends/gpu/musa/rocm_helper.h | 74 --- paddle/phi/backends/gpu/musa/rocm_info.cc | 334 ---------- 6 files changed, 16 insertions(+), 1448 deletions(-) delete mode 100644 paddle/phi/backends/gpu/musa/miopen_desc.h delete mode 100644 paddle/phi/backends/gpu/musa/miopen_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/rocm_device_function.h delete mode 100644 paddle/phi/backends/gpu/musa/rocm_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/rocm_info.cc diff --git a/paddle/phi/backends/gpu/musa/miopen_desc.h b/paddle/phi/backends/gpu/musa/miopen_desc.h deleted file mode 100644 index ae0e274ca650e..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_desc.h +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#include "paddle/phi/core/utils/data_type.h" - -namespace phi { -namespace backends { -namespace gpu { - -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); - int H, W, D, C; - if (dims.size() == 4) { - H = dims[1]; - W = dims[2]; - C = dims[3]; - transformed_dims[1] = C; - transformed_dims[2] = H; - transformed_dims[3] = W; - } else { - D = dims[1]; - H = dims[2]; - W = dims[3]; - C = dims[4]; - transformed_dims[1] = C; - transformed_dims[2] = D; - transformed_dims[3] = H; - transformed_dims[4] = W; - } - return transformed_dims; -} - -inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { - miopenDataType_t type = miopenFloat; - switch (t) { - case phi::DataType::FLOAT16: - type = miopenHalf; - break; - case phi::DataType::FLOAT32: - type = miopenFloat; - break; - default: - break; - } - return type; -} - -class ActivationDescriptor { - public: - using T = miopenActivationDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(t)); - t = nullptr; - } - } - }; - ActivationDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - template - void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_.get(), mode, static_cast(coef), 0.0, 0.0)); - } - - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - private: - std::unique_ptr desc_; -}; - -class TensorDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - TensorDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - void 
set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { - const int groups = 1; - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class FilterDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - FilterDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, - const miopenTensorFormat_t format, - const int groups = 1) { - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class ConvolutionDescriptor { - public: - using T = miopenConvolutionDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(t)); - t = nullptr; - } - } - }; - ConvolutionDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(miopenDataType_t dtype, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations, - bool allow_tf32, - const int groups = 1) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - (miopenConvolutionDescriptor_t)desc_.get(), - static_cast(pads.size()), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( - (miopenConvolutionDescriptor_t)desc_.get(), groups)); - } - - private: - std::unique_ptr desc_; -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/miopen_helper.h b/paddle/phi/backends/gpu/musa/miopen_helper.h deleted file mode 100644 index 
095f32ba460d0..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_helper.h +++ /dev/null @@ -1,595 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "gflags/gflags.h" - -#include "paddle/phi/backends/dynload/miopen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" - -// MIOPEN do not have epslion definition -#define CUDNN_BN_MIN_EPSILON 1e-05 - -DECLARE_bool(cudnn_deterministic); - -namespace phi { -namespace backends { -namespace gpu { - -inline const char* miopenGetErrorString(miopenStatus_t status) { - switch (status) { - case miopenStatusSuccess: - return "miopenStatusSuccess"; - case miopenStatusNotInitialized: - return "miopenStatusNotInitialized"; - case miopenStatusAllocFailed: - return "miopenStatusAllocFailed"; - case miopenStatusBadParm: - return "miopenStatusBadParm"; - case miopenStatusInternalError: - return "miopenStatusInternalError"; - case miopenStatusInvalidValue: - return "miopenStatusInvalidValue"; - case miopenStatusUnknownError: - return "miopenStatusUnknownError"; - case miopenStatusNotImplemented: - return "miopenStatusNotImplemented"; - default: - return "Unknown miopen error number"; - } -} - -// no use, but will have compiling error if not defined -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) - -enum class DataLayout { // Not use - kNHWC, - kNCHW, - kNCDHW, - kNDHWC, // add, liyamei - kNCHW_VECT_C, -}; - -enum class PoolingMode { - kMaximum, - kMaximumDeterministic, - kAverageExclusive, - kAverageInclusive, -}; - -enum class ActivationMode { - kNone, // activation identity - kSigmoid, - kRelu, - kRelu6, - kReluX, - kTanh, - kBandPass, -}; - -inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return miopenPoolingMax; - case PoolingMode::kAverageExclusive: - return miopenPoolingAverage; - case PoolingMode::kAverageInclusive: - return miopenPoolingAverageInclusive; - case PoolingMode::kMaximum: - return miopenPoolingMax; - default: - PADDLE_THROW( - phi::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); - } -} - -inline ActivationMode StringToActivationMode(const std::string& str) { - if (str == "identity") { - return ActivationMode::kNone; - } else if (str == "sigmoid") { - return ActivationMode::kSigmoid; - } else if (str == "relu") { - return ActivationMode::kRelu; - } else if (str == "relu6") { - return ActivationMode::kRelu6; - } else if (str == "relux") { - return ActivationMode::kReluX; - } else if (str == "tanh") { - return ActivationMode::kTanh; - } else if (str == "bandpass") { - return ActivationMode::kBandPass; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unknown 
MIOPEN activation string: %s.", str)); - } -} - -template -class CudnnDataType; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenHalf; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenBFloat16; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenFloat; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { - switch (order) { - case DataLayout::kNHWC: - return MIOPEN_TENSOR_NHWC; - case DataLayout::kNCHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNCDHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNDHWC: - return MIOPEN_TENSOR_NHWC; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "MIOPEN has no equivalent dataLayout for input order.")); - } - return MIOPEN_TENSOR_NCHW; -} - -class ScopedTensorDescriptor { - public: - ScopedTensorDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& dims, - const int groups = 1) { - // the format is not used now, will add later - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - // Update tensor descriptor dims setting if groups > 1 - // NOTE: Here, Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - - // MIOPEN ONLY support data layout of NCHW - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - if (dims.size() == 4) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } else if (dims.size() == 5) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); - } - - inline miopenTensorDescriptor_t 
descriptor(const miopenDataType_t miopen_type, - const std::vector& dim, - const std::vector& stride) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - miopen_type, - dim.size(), - const_cast(dim.data()), - const_cast(stride.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const std::vector& dim, - const std::vector& stride) { - return descriptor(CudnnDataType::type, dim, stride); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); -}; - -class ScopedDropoutDescriptor { - public: - ScopedDropoutDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateDropoutDescriptor(&desc_)); - } - ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyDropoutDescriptor(desc_)); - } - - inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, - const phi::Place& place, - bool initialized, - float dropout_prob_, - phi::DenseTensor* dropout_state_, - int seed, - size_t state_size) { - if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - 0 /* dropout */, - nullptr, - 0 /* state_size */, - 0 /* seed */, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - return desc_; - } - auto* dropout_state_data = dropout_state_->data(); - if (!initialized) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - seed, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } else { - auto dropout_state_dims = dropout_state_->dims(); - state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( - desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - 0, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } - return desc_; - } - inline miopenDropoutDescriptor_t desc() { return desc_; } - - private: - miopenDropoutDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); -}; - -class ScopedRNNDescriptor { - public: - ScopedRNNDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); - } - ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); - } - - inline miopenRNNDescriptor_t desc() { return desc_; } - - private: - miopenRNNDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); -}; - -class ScopedFilterDescriptor { - public: - ScopedFilterDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& kernel, - const int groups = 1) { - // filter layout: MCHW(MCDHW), where M is the number of - // output image channels, C is the number of input image channels, - // D is the depth of the filter, H is the height of the filter, and W is the - // width of the filter. - std::vector kernel_with_group(kernel.begin(), kernel.end()); - if (groups > 1) { - kernel_with_group[0] /= groups; - // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
- } - std::vector stride_dim(kernel_with_group.size()); - stride_dim.push_back(1); - for (int k = kernel_with_group.size() - 2; k >= 0; k--) { - stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - kernel_with_group.size(), - const_cast(kernel_with_group.data()), - const_cast(stride_dim.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& kernel, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, kernel, groups); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); -}; - -class ScopedConvolutionDescriptor { - public: - ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); - } - ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); - } - - inline miopenConvolutionDescriptor_t descriptor( - miopenDataType_t type, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - PADDLE_ENFORCE_EQ(pads.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of pads and strides should be equal. But " - "received size of pads is %d, size of strides is %d.", - pads.size(), - strides.size())); - PADDLE_ENFORCE_EQ( - pads.size(), - dilations.size(), - phi::errors::InvalidArgument( - "The size of pads and dilations should be equal. But received size " - "of pads is %d, size of dilations is %d.", - pads.size(), - dilations.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - desc_, - pads.size(), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - return desc_; - } - - template - inline miopenConvolutionDescriptor_t descriptor( - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - return descriptor(CudnnDataType::type, pads, strides, dilations); - } - - private: - miopenConvolutionDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); -}; - -class ScopedPoolingDescriptor { - public: - ScopedPoolingDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreatePoolingDescriptor(&desc_)); - } - ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyPoolingDescriptor(desc_)); - } - - inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, - const std::vector& kernel, - const std::vector& pads, - const std::vector& strides) { - PADDLE_ENFORCE_EQ(kernel.size(), - pads.size(), - phi::errors::InvalidArgument( - "The size of kernel and pads should be equal. But " - "received size of kernel is %d, size of pads is %d.", - kernel.size(), - pads.size())); - PADDLE_ENFORCE_EQ( - kernel.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of kernel and strides should be equal. 
But " - "received size of kernel is %d, size of strides is %d.", - kernel.size(), - strides.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( - desc_, - GetPoolingMode(mode), - kernel.size(), - const_cast(kernel.data()), - const_cast(pads.data()), - const_cast(strides.data()))); - return desc_; - } - - private: - miopenPoolingDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); -}; - -class ScopedActivationDescriptor { - public: - ScopedActivationDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&desc_)); - } - ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(desc_)); - } - - template - inline miopenActivationDescriptor_t descriptor( - const std::string& act, double value_max = static_cast(0.)) { - double relu_ceiling = 0.0; - ActivationMode activation_mode = StringToActivationMode(act); - miopenActivationMode_t mode; - switch (activation_mode) { - case ActivationMode::kNone: - mode = miopenActivationPASTHRU; - break; - case ActivationMode::kRelu6: - relu_ceiling = 6.0; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kReluX: - relu_ceiling = value_max; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kRelu: - mode = miopenActivationRELU; - break; - case ActivationMode::kSigmoid: - mode = miopenActivationLOGISTIC; - break; - case ActivationMode::kTanh: - mode = miopenActivationTANH; - break; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "Unrecognized MIOPEN activation mode: %d.", - static_cast(activation_mode))); - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_, mode, relu_ceiling, 0.0, 0.0)); - return desc_; - } - - private: - miopenActivationDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); -}; - -class ScopedCTCLossDescriptor { - public: - ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); - } - ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); - } - - template - inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( - desc_, CudnnDataType::type, 0, false)); - return desc_; - } - - private: - miopenCTCLossDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc index 6579ce63f21f6..f2087e4d7f4fc 100644 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -88,16 +88,15 @@ int GetGPUComputeCapability(int id) { "but received id is: %d. 
GPU count is: %d.", id, GetGPUDeviceCount())); - return 100; - //int major, minor; - //auto major_error_code = musaDeviceGetAttribute( - // &major, musaDeviceAttributeComputeCapabilityMajor, id); - //auto minor_error_code = musaDeviceGetAttribute( - // &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - //PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - //PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - //return major * 100 + minor; + int major, minor; + auto major_error_code = musaDeviceGetAttribute( + &major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = musaDeviceGetAttribute( + &minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; } int GetGPURuntimeVersion(int id) { @@ -138,7 +137,8 @@ int GetGPUMultiProcessors(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); return count; } @@ -152,7 +152,7 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); return count; } @@ -167,7 +167,7 @@ int GetGPUMaxThreadsPerBlock(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); return count; } @@ -188,17 +188,17 @@ std::array GetGpuMaxGridDimSize(int id) { std::array ret; int size; auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); ret[0] = size; auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); ret[1] = size; auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); ret[2] = size; return ret; diff --git a/paddle/phi/backends/gpu/musa/rocm_device_function.h b/paddle/phi/backends/gpu/musa/rocm_device_function.h deleted file mode 100644 index 6f5d684075f0f..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_device_function.h +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. 
-#define PADDLE_CUDA_FP16 -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" - -namespace phi { -namespace backends { -namespace gpu { - -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down(val, delta, width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor(val, width); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { - return phi::dtype::float16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { - return phi::dtype::bfloat16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { - return phi::dtype::float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { - return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl(val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. 
- // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else - const int warpSize = 32; -#endif - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_helper.h b/paddle/phi/backends/gpu/musa/rocm_helper.h deleted file mode 100644 index 07fdde5a2f417..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace phi { -namespace backends { -namespace gpu { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why need this macro? ] - * - * The original looping in CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB, the first iteration is no problem here, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * will greater than INT_MAX and overflow becomes negative value, at - * this time, the cycle condition `i < (n)` is still satisfied, so it - * will cause illegal access to cuda memory. - * - * Here is a real example in ERINE, it will trigger above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follow, the int64_t __index__ will - * prevent overflow in the loop increment. 
- * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ - int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_info.cc b/paddle/phi/backends/gpu/musa/rocm_info.cc deleted file mode 100644 index 32c7c329253b1..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_info.cc +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "paddle/phi/backends/gpu/gpu_info.h" - -#include "paddle/phi/core/enforce.h" - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -namespace phi { -namespace backends { -namespace gpu { - -int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -} - -static int GetGPUDeviceCountImpl() { - int driverVersion = 0; - musaError_t status = musaDriverGetVersion(&driverVersion); - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - - const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); - - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " - "empty. 
No GPU detected."; - return 0; - } - } - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); - return count; -} - -int GetGPUDeviceCount() { - // cache the count - static auto dev_cnt = GetGPUDeviceCountImpl(); - return dev_cnt; -} - -int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int major, minor; - auto major_error_code = musaDeviceGetAttribute( - &major, musaDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = musaDeviceGetAttribute( - &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - return major * 100 + minor; -} - -int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int runtime_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); - return runtime_version; -} - -int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int driver_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); - return driver_version; -} - -bool TensorCoreAvailable() { return false; } - -int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); - return count; -} - -int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); - - return count; -} - -int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); - return count; -} - -int GetCurrentDeviceId() { - int device_id; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); - return device_id; -} - -std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - std::array ret; - int size; - auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret[0] = size; - - auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret[1] = size; - - auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret[2] = size; - return ret; -} - -std::pair GetGpuStreamPriorityRange() { - int least_priority, greatest_priority; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); - return std::make_pair(least_priority, greatest_priority); -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = GetGPUDeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(phi::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, - static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); -} - -void GpuMemcpyAsync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); -} - -void GpuMemcpySync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); -} - -void GpuMemcpyPeerAsync(void *dst, - int dst_device, - const void *src, - int src_device, - size_t count, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -} - -void GpuMemcpyPeerSync( - void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeer(dst, dst_device, src, src_device, count)); -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); -} - -void GpuStreamSync(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); -} - -void GpuDestroyStream(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); -} - -void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } - -gpuError_t GpuGetLastError() { return musaGetLastError(); } - -bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - // TODO(qili93): Hygon DTK (21.04 and 22.04) not support - // musaDeviceAttributeManagedMemory, temporary disable by default, to be - // verified in next DTK release - return false; -} - -bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); -#ifdef __linux__ - return IsGPUManagedMemorySupported(dev_id) && - GetGPUComputeCapability(dev_id) >= 60; -#else - return false; -#endif -} - -} // namespace gpu -} // namespace backends -} // namespace phi From 342b7385f381af2addab2ec541f52af9b7b7437b Mon Sep 17 00:00:00 2001 From: "yiyuan.zhou" Date: Tue, 25 Jul 2023 18:27:19 +0800 Subject: [PATCH 05/55] add musa_stream --- paddle/fluid/pybind/cuda_streams_py.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index da6dee7657c09..de97b39218157 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -84,6 +84,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif From 67d65fd1d8819d3f18c493a3a0e3d425aeecf583 Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Mon, 24 Jul 2023 12:12:15 +0000 Subject: [PATCH 06/55] [MTAI] build(system): enable build system in paddle for MUSA --- .../operators/fused/fused_seqpool_cvm_op.cu | 63 ++++++++++++++++++- .../operators/fused/multihead_matmul_op.cu | 4 +- .../fluid/operators/fused/yolo_box_post_op.cu | 61 ++++++++++++++---- .../operators/math/bert_encoder_functor.h | 6 ++ paddle/fluid/operators/math/sample_prob.cu | 7 ++- .../optimizers/distributed_fused_lamb_op.cu | 29 +++++++-- .../fluid/operators/reader/buffered_reader.cc | 7 ++- paddle/fluid/platform/collective_helper.cc | 2 +- paddle/fluid/platform/device/gpu/gpu_helper.h | 4 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 8 +-- .../platform/device/gpu/gpu_resource_pool.cc | 10 +++ .../platform/device/gpu/gpu_resource_pool.h | 5 ++ paddle/fluid/platform/device/gpu/gpu_types.h | 4 +- paddle/fluid/platform/enforce.h | 10 +++ paddle/fluid/platform/event.h | 3 + paddle/fluid/platform/profiler.cu | 20 +++++- paddle/fluid/platform/profiler/profiler.cc | 6 ++ paddle/fluid/platform/profiler/utils.cc | 2 + paddle/fluid/platform/profiler/utils.h | 2 + paddle/fluid/platform/profiler_helper.h | 12 ++++ .../fluid/platform/stream_callback_manager.h | 5 ++ paddle/fluid/pybind/cuda_streams_py.cc | 2 + paddle/fluid/pybind/tensor_py.h | 3 + paddle/phi/core/cuda_stream.h | 5 -- 24 files changed, 246 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 362860aa23bdf..f038190e72927 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -122,7 +122,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext memory::AllocShared(ctx.GetPlace(), total_ptr_len * sizeof(void *)); void *ptr = temp_ptr->ptr(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, input_data.data(), @@ -150,6 +150,34 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_input_values, + input_data.data(), + input_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_output_values 
= + reinterpret_cast(&gpu_input_values[input_data.size()]); + platform::GpuMemcpyAsync(gpu_output_values, + output_data.data(), + output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_seqpool_output_values = + reinterpret_cast(&gpu_output_values[output_data.size()]); + platform::GpuMemcpyAsync(gpu_seqpool_output_values, + seqpool_output_data.data(), + seqpool_output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + size_t **lods_values = reinterpret_cast( + &gpu_seqpool_output_values[seqpool_output_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -325,7 +353,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, cvm_data.size() + lods.size(); auto temp_ptr = memory::AllocShared(ctx.GetPlace(), total_ptr_len * sizeof(void *)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, out_grads_data.data(), @@ -356,6 +384,37 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_out_grads_values, + out_grads_data.data(), + out_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_in_grads_values = + reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_in_grads_values, + in_grads_data.data(), + in_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_cvm_values = + reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_cvm_values, + cvm_data.data(), + cvm_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + size_t **lods_values = + reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 8402bc78ef64c..36d0de8c6c9d1 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -327,8 +327,10 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { temp_bias_tensor.Resize({size}); auto *temp_qk_bias = device_ctx.template Alloc( &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemset(temp_qk_bias, 0, sizeof(float) * size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(temp_qk_bias, 0, sizeof(float) * size); #else cudaMemset(temp_qk_bias, 0, sizeof(float) * size); #endif diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 72bb97a2aae9e..6b8874d289c77 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -252,9 +252,12 @@ static void YoloTensorParseCuda( // Estimate how many boxes will be choosed int bbox_count = 0; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) 
hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -265,9 +268,12 @@ static void YoloTensorParseCuda( class_num, anchors_num, prob_thresh); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -280,9 +286,12 @@ static void YoloTensorParseCuda( float* bbox_tensor = *bboxes_tensor_ptr; // Update previous maximum bbox number if (bbox_count > *bbox_count_max_alloc) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_tensor); + musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -293,9 +302,12 @@ static void YoloTensorParseCuda( // Now generate bboxes int bbox_index = 0; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -349,13 +361,20 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.insert(anchors.end(), anchors1.begin(), anchors1.end()); anchors.insert(anchors.end(), anchors2.begin(), anchors2.end()); int* device_anchors; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); hipMemcpy(device_anchors, anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + musaMemcpy(device_anchors, + anchors.data(), + anchors.size() * sizeof(int), + musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -384,10 +403,14 @@ class YoloBoxPostKernel : public framework::OpKernel { int batch = context.Input("ImageShape")->dims()[0]; TensorInfo* ts_info = new TensorInfo[batch * boxes_input.size()]; for (int i = 0; i < batch * static_cast(boxes_input.size()); i++) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -395,9 +418,12 @@ class YoloBoxPostKernel : public framework::OpKernel { #endif ts_info[i].bboxes_host_ptr = reinterpret_cast(malloc( ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float))); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + 
musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); @@ -407,8 +433,10 @@ class YoloBoxPostKernel : public framework::OpKernel { // Box index counter in gpu memory // *bbox_index_device_ptr used by atomicAdd int* bbox_index_device_ptr; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -450,12 +478,18 @@ class YoloBoxPostKernel : public framework::OpKernel { bbox_count_max_alloc * (5 + class_num) * sizeof(float))); } // we need copy bbox_count_host boxes to cpu memory -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, + ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -532,15 +566,20 @@ class YoloBoxPostKernel : public framework::OpKernel { boxes_num_data[batch_id] = bbox_det_vec.size(); } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipFree(bbox_index_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif for (int i = 0; i < batch * boxes_input.size(); i++) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(ts_info[i].bboxes_dev_ptr); + musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index a9869e5faecce..e5adc97fa7890 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -18,6 +18,12 @@ limitations under the License. 
*/
 #include 
 #include 
+#include   // NOLINT
+#endif
+#ifdef PADDLE_WITH_MUSA
+#include 
+#include 
+
 #include   // NOLINT
 #endif
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu
index 0c6b49729546c..4aa38e7441917 100644
--- a/paddle/fluid/operators/math/sample_prob.cu
+++ b/paddle/fluid/operators/math/sample_prob.cu
@@ -155,11 +155,16 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context,
   int num_tries = UniqSampler(sampler, num_samples, s_data);
   VLOG(1) << "num_tries: " << num_tries;
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(samples_data + num_true,
                                        s_data,
                                        sizeof(int64_t) * num_samples,
                                        hipMemcpyHostToDevice));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true,
+                                        s_data,
+                                        sizeof(int64_t) * num_samples,
+                                        musaMemcpyHostToDevice));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true,
                                         s_data,
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index cad7e38ba1c1a..ba520f026bf7a 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -32,6 +32,11 @@
 #include "cub/cub.cuh"
 #include "math.h"  // NOLINT
 #endif
+#ifdef __MUSACC__
+#include "cub/cub.cuh"
+#include "math.h"  // NOLINT
+#endif
+
 #ifdef __HIPCC__
 #include 
@@ -51,8 +56,10 @@ using phi::funcs::ToVector;
 template 
 static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) {
   static_assert(!std::is_same::value, "T cannot be void.");
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream));
 #endif
@@ -250,10 +257,14 @@ static void LogParamAndTrustRatioDivSquareNorm(
 static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) {
   auto stream = dev_ctx.stream();
   float cpu_value;
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
       &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(
+      &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
       &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream));
@@ -1129,10 +1140,14 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) {
                           stream,
                           &cub_buffer);
   T ret_cpu[2];
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
       &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(
+      &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
       &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream));
@@ -1183,12 +1198,18 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) {
                   dev_ctx.stream(),
&buffer); bool flag; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(&flag, out.Get(), sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, + out.Get(), + sizeof(flag), + musaMemcpyDeviceToHost, + dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 73b3823d3e5ab..8255acecb3707 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -192,11 +192,16 @@ void BufferedReader::ReadAsync(size_t i) { // cuda[i].mutable_data() is called, since some ops release // cuda memory immediately without waiting cuda kernel ends platform::SetDeviceId(place_.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index a6c2b9d61dd2b..941cd49cd361d 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -172,7 +172,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( { PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 7fde4429bb7f3..ac096b94bed84 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -15,8 +15,10 @@ #pragma once #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/fluid/platform/device/gpu/musa/musa_helper.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 73fe0ca05ba73..2959773e14737 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/dynload/miopen.h" #elif defined(PADDLE_WITH_MUSA) //TODO(Xiaokang Shang) @@ -212,7 +212,7 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); gpuError_t result; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) if (UNLIKELY(malloc_managed_memory)) { result = hipMallocManaged(ptr, size); } else { @@ -267,7 +267,7 @@ class RecordedGpuMallocHelper { // process is terminating, in which case we don't care if // cudaFree succeeds. CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { #elif defined(PADDLE_WITH_MUSA) @@ -318,7 +318,7 @@ class RecordedGpuMallocHelper { size_t *actual_total) { { CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) auto result = hipMemGetInfo(actual_avail, actual_total); #elif defined(PADDLE_WITH_MUSA) auto result = musaMemGetInfo(actual_avail, actual_total); diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index ee60040f09074..d8e9197bf6ea5 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -30,6 +30,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -41,6 +44,8 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -82,6 +87,9 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -93,6 +101,8 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index ff1452153e7bd..8de12bba141c6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -21,6 +21,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index b3d4c7071c216..dac2add9f82c1 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -101,7 +101,7 @@ using CUDAGraphID = 
unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; #elif defined(PADDLE_WITH_MUSA) @@ -116,7 +116,7 @@ using CUDAGraphID = unsigned long long; // NOLINT DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, hipErrorOutOfMemory, - musaErrorOutOfMemory); + musaErrorMemoryAllocation); DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 105c5f0607f69..160d6fb9912cb 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,6 +38,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e807a54fdee2d..e1a40cb8f7f64 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,6 +21,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 5d1caffd45326..1f9bacecfea4b 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -38,7 +42,7 @@ static void ForEachDevice(std::function func) { } void DummyKernelAndEvent() { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { platform::SetDeviceId(d); @@ -52,6 +56,20 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } +#elif defined(PADDLE_WITH_MUSA) + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + platform::SetDeviceId(d); + musaStream_t stream; + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); + Mark("_cuda_startup_"); + int *ptr; + PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); + }); + } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 8f34d5acc0bee..ca3211ba103aa 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,6 +18,9 @@ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif @@ -43,6 +46,9 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); +#endif #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 7fb25b25577c4..a4fb29b86f43f 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ 
b/paddle/fluid/platform/profiler/utils.cc @@ -93,6 +93,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, return occupancy; } +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t DeviceId, diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index c9437e0e7793a..5adaadf87d288 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -133,6 +133,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, int32_t BlockZ, void* kernelFunc, uint8_t launchType); +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 5dad7788d0b09..2fa0ece0f9883 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -31,6 +31,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA #ifdef PADDLE_WITH_HIP #include #endif @@ -103,6 +106,15 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif +#ifdef PADDLE_WITH_MUSA + int pre_device_id = GetCurrentDeviceId(); + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + } + SetDeviceId(pre_device_id); +#endif #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 7cd6930a9d0d0..10b0a1aded0d9 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -21,6 +21,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index da6dee7657c09..de97b39218157 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -84,6 +84,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 8b4f4dcd62de1..b7375243d8db9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -466,6 +466,9 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + paddle::platform::GpuMemcpySync( + dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 87ab5e23818fb..26ec22f103a90 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,11 +28,6 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif -#ifdef PADDLE_WITH_CUDA -#include -using gpuStream_t = cudaStream_t; -#endif - #ifdef PADDLE_WITH_MUSA #include using gpuStream_t = musaStream_t; From 
a1a54eeb607ddf12ebc04d8d9d138507c00a274f Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Mon, 24 Jul 2023 12:12:15 +0000 Subject: [PATCH 07/55] [MTAI] build(system): enable build system in paddle for MUSA --- paddle/fluid/framework/conv_search_cache.h | 4 ++-- .../fluid/inference/api/analysis_predictor.cc | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 2 +- .../fluid/memory/allocation/cuda_allocator.cc | 1 - .../cuda_device_context_allocator.h | 6 ++--- .../allocation/naive_best_fit_allocator.cc | 2 +- .../memory/allocation/pinned_allocator.cc | 4 ++-- .../memory/allocation/system_allocator.cc | 2 +- paddle/fluid/memory/memcpy.cc | 22 +++++++++---------- .../fluid/operators/class_center_sample_op.cu | 4 ++-- 10 files changed, 24 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index cbac8cac4e543..aca4ce5f23d8c 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -32,7 +32,7 @@ class ConvSearchCache { static ConvSearchCache instance; return instance; } -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP AlgorithmsCache* GetForward() { return &forward_cache_; } @@ -69,7 +69,7 @@ class ConvSearchCache { ConvSearchCache(const ConvSearchCache&) {} ConvSearchCache& operator=(const ConvSearchCache&) {} -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 12e893d72781f..12af725b6e407 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2204,7 +2204,7 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); #elif defined(PADDLE_WITH_MUSA) musaStreamSynchronize(stream); diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index ed2993e7a39e7..3c8f0694ee774 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -215,7 +215,7 @@ void QkvToContextPluginDynamic::configurePlugin( fake_qk_bias_ = reinterpret_cast( tensor_.mutable_data(platform::CUDAPlace(device_id))); int64_t size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_; -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #elif defined(PADDLE_WITH_MUSA) diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index da5fdc829e8c0..51e6c88d55d50 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -24,7 +24,6 @@ #include #endif -#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 1401aeb7a11be..42e6f7be8de31 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ 
b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -79,7 +79,7 @@ class GPUContextAllocator : public Allocator { gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #elif defined(PADDLE_WITH_MUSA) @@ -94,7 +94,7 @@ class GPUContextAllocator : public Allocator { ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); #elif defined(PADDLE_WITH_MUSA) PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); @@ -114,7 +114,7 @@ class GPUContextAllocator : public Allocator { auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #elif defined(PADDLE_WITH_MUSA) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index a7af040f86c5f..93ebf7a1af16b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -313,7 +313,7 @@ void *Alloc(const platform::CUDAPlace &place, string::HumanReadableSize(Used(place)))); } else { if (FLAGS_init_allocated_mem) { -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hipMemset(ptr, 0xEF, size); #elif defined(PADDLE_WITH_MUSA) musaMemset(ptr, 0xEF, size); diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 33c6ca55880cd..4737e5c565b45 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -21,7 +21,7 @@ namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(musaHostFree(allocation->ptr())); @@ -37,7 +37,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(musaHostMalloc(&ptr, size, musaHostMallocPortable)); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 4234b615c823b..d67df333cfaba 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -214,7 +214,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. 
-#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); #elif defined(PADDLE_WITH_MUSA) musaError_t result = musaHostMalloc(&p, size, musaHostMallocPortable); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 45b2ec3ca3875..b87cff7a7a429 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -259,7 +259,7 @@ void Copy(phi::Place dst_place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP inline void SyncCUDAStream() { #if !defined(_WIN32) hipStreamSynchronize(0); @@ -319,7 +319,7 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -341,7 +341,7 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); @@ -371,7 +371,7 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -393,7 +393,7 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); @@ -425,7 +425,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -448,7 +448,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); @@ -532,7 +532,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -555,7 +555,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); @@ -582,7 +582,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -605,7 +605,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#if 
defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 5327be6909b4f..2c4b4f1ceacf6 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP #include #include @@ -72,7 +72,7 @@ __global__ void RandomSampleClassCenter(const int64_t n, size_t local_seed = (static_cast(seed) + 0x9E3779B9U + (static_cast(id) << 6U) + (static_cast(id) >> 2U)); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hiprand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); From 7e92cf78a4775e7aeb0a6d13e3b1f1c9150ed9c0 Mon Sep 17 00:00:00 2001 From: Xiaokang Shang Date: Wed, 26 Jul 2023 10:11:12 +0000 Subject: [PATCH 08/55] change kernels --- paddle/phi/kernels/autotune/gpu_timer.h | 16 ++++++ paddle/phi/kernels/batch_norm_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_kernel.cc | 2 +- paddle/phi/kernels/funcs/blas/blas.h | 9 ++-- paddle/phi/kernels/funcs/blas/blas_impl.h | 4 +- paddle/phi/kernels/funcs/dropout_impl.cu.h | 19 +++++++ paddle/phi/kernels/funcs/embedding_grad.h | 6 +-- paddle/phi/kernels/funcs/fft.cu | 7 ++- paddle/phi/kernels/funcs/fft_cache.h | 2 + paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 4 +- paddle/phi/kernels/funcs/math_cuda_utils.h | 3 ++ paddle/phi/kernels/funcs/select_impl.cu.h | 3 ++ paddle/phi/kernels/funcs/softmax.cu | 4 ++ paddle/phi/kernels/funcs/sparse/sparse_blas.h | 3 ++ .../fusion/gpu/fused_dropout_add_kernel.cu | 4 ++ .../fusion/gpu/fused_softmax_mask_utils.h | 6 ++- .../phi/kernels/gpu/activation_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/activation_kernel.cu | 2 +- paddle/phi/kernels/gpu/allclose_kernel.cu | 2 + .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 29 +++++++++-- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 22 ++++++-- .../kernels/gpu/cross_entropy_grad_kernel.cu | 2 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 12 +++++ .../phi/kernels/gpu/cudnn_lstm_grad_kernel.cu | 10 +++- paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu | 51 ++++++++++++++++++- paddle/phi/kernels/gpu/cudnn_lstm_utils.h | 3 ++ paddle/phi/kernels/gpu/cum_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/dirichlet_kernel.cu | 7 +++ .../phi/kernels/gpu/embedding_grad_kernel.cu | 3 ++ .../phi/kernels/gpu/graph_reindex_kernel.cu | 22 ++++++-- .../gpu/graph_sample_neighbors_kernel.cu | 17 +++++++ .../kernels/gpu/graph_send_ue_recv_funcs.h | 9 ++++ paddle/phi/kernels/gpu/group_norm_kernel.cu | 3 ++ .../kernels/gpu/instance_norm_grad_kernel.cu | 29 ++++++++++- .../phi/kernels/gpu/instance_norm_kernel.cu | 30 ++++++++++- .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 4 +- .../kernels/gpu/log_softmax_grad_kernel.cu | 7 +++ paddle/phi/kernels/gpu/log_softmax_kernel.cu | 7 +++ .../kernels/gpu/logcumsumexp_grad_kernel.cu | 2 +- .../phi/kernels/gpu/logsumexp_function.cu.h | 40 ++++++++++++++- .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/nll_loss_kernel.cu | 2 
+ paddle/phi/kernels/gpu/rnn_functor.h | 36 +++++++++++++ paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc | 16 ++++++ paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 22 +++++++- .../kernels/gpu/send_u_recv_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/send_u_recv_kernel.cu | 4 ++ .../kernels/gpu/send_ue_recv_grad_kernel.cu | 23 +++++++++ paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 6 ++- paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 13 +++++ paddle/phi/kernels/gpu/top_k_kernel.cu | 4 +- .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 40 +++++++++------ paddle/phi/kernels/gpudnn/conv_kernel.cu | 15 ++++-- .../gpudnn/conv_transpose_grad_kernel.cu | 51 +++++++++++-------- .../kernels/gpudnn/conv_transpose_kernel.cu | 19 +++++-- paddle/phi/kernels/gpudnn/pool_grad_kernel.cu | 25 +++++++-- paddle/phi/kernels/gpudnn/pool_kernel.cu | 26 ++++++++-- .../phi/kernels/gpudnn/softmax_grad_kernel.cu | 2 +- paddle/phi/kernels/impl/conv_cudnn_impl.h | 2 + paddle/phi/kernels/impl/isclose_kernel_impl.h | 2 + .../kernels/impl/segment_pool_kernel_impl.h | 5 ++ .../kernels/primitive/datamover_primitives.h | 4 ++ paddle/phi/kernels/reduce_min_kernel.cc | 5 ++ .../kernels/sparse/batch_norm_grad_kernel.cc | 2 +- .../phi/kernels/sparse/batch_norm_kernel.cc | 2 +- .../phi/kernels/sparse/gpu/coalesce_kernel.cu | 4 ++ paddle/phi/kernels/sparse/gpu/conv.cu.h | 6 +++ .../phi/kernels/sparse/gpu/convolution.cu.h | 29 +++++++++++ .../kernels/sparse/gpu/elementwise_kernel.cu | 2 + .../kernels/sparse/gpu/matmul_grad_kernel.cu | 10 +++- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 2 + paddle/phi/kernels/sparse/gpu/slice_kernel.cu | 12 +++++ .../kernels/sparse/gpu/softmax_grad_kernel.cu | 3 ++ .../kernels/sparse/gpu/sparse_utils_kernel.cu | 14 +++-- paddle/phi/kernels/strings/gpu/copy_utils.h | 6 +++ paddle/phi/kernels/strings/unicode.cc | 8 +++ 80 files changed, 740 insertions(+), 105 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 87eca2613a7b5..3817e62791c47 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -23,6 +23,9 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif namespace phi { @@ -32,6 +35,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreate(&start_); + musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -46,6 +52,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(start_); + musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -55,6 +64,8 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -63,6 +74,8 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -73,6 +86,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventSynchronize(stop_); + musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); 
cudaEventElapsedTime(&milliseconds, start_, stop_); diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index bf04c99dab0a3..dba08b0de366a 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 8dcd3a1d995d8..58cacd21bba18 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(coalesce_tensor, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 65ee3c1851003..81ed7170d7a24 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index dbab3bd326664..47ab1a7839066 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 2ea7a306f16fd..3b758882e4072 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,7 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +303,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -445,7 +445,7 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(ARGS... 
args) const { Base()->template MatMulWithHead(args...); @@ -593,3 +593,6 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif +#ifdef PADDLE_WITH_MUSA +// TODO +#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index ffafe15b8fcf2..5e4c058ee589b 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1452,7 +1452,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, @@ -1698,7 +1698,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index a1fc2c225ecf2..b4387e594d577 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -24,6 +24,10 @@ limitations under the License. */ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/kernels/funcs/dropout_impl_util.h" @@ -142,6 +146,10 @@ __global__ void VectorizedRandomGenerator(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -212,6 +220,10 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -295,6 +307,11 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -430,6 +447,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/embedding_grad.h 
b/paddle/phi/kernels/funcs/embedding_grad.h index 3ad0f22c8e912..8d43553325277 100644 --- a/paddle/phi/kernels/funcs/embedding_grad.h +++ b/paddle/phi/kernels/funcs/embedding_grad.h @@ -96,7 +96,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table, unsigned long long int matchmask = // NOLINT __ballot(match_found_this_thread); // NOLINT int first_remaining_peer = __ffsll(matchmask) - 1; -#else +#else // MUSA and CUDA // If and only if match_found_this_thread of the Nth thread is non-zero, // set the Nth bit of matchmask to 1. unsigned int matchmask = @@ -112,7 +112,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table, while (matchmask) { #ifdef PADDLE_WITH_HIP first_remaining_peer = __ffsll(matchmask) - 1; -#else +#else // CUDA and MUSA first_remaining_peer = __ffs(matchmask) - 1; #endif my_s[threadIdx.x] += @@ -142,7 +142,7 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& ctx, #ifdef PADDLE_WITH_HIP constexpr int kWarpSize = 64; constexpr int kBlockDimY = 16; -#else +#else // CUDA and MUSA constexpr int kWarpSize = 32; constexpr int kBlockDimY = 32; #endif diff --git a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu index edac497bc8e8b..42786f4b64355 100644 --- a/paddle/phi/kernels/funcs/fft.cu +++ b/paddle/phi/kernels/funcs/fft.cu @@ -104,7 +104,7 @@ inline bool use_cache(const int64_t* signal_size) { } return using_cache; } -#elif defined(PADDLE_WITH_HIP) +#elif defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) inline bool use_cache(const int64_t* signal_size) { return true; } #endif @@ -200,6 +200,11 @@ void exec_fft(const phi::GPUContext& ctx, phi::dynload::hipfftSetStream(config->plan(), ctx.stream())); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mufftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mufftSetWorkArea(config->plan(), workspace_tensor.data())); #endif // execution of fft plan diff --git a/paddle/phi/kernels/funcs/fft_cache.h b/paddle/phi/kernels/funcs/fft_cache.h index 51e90a6c0d95b..a6f775af88ea7 100644 --- a/paddle/phi/kernels/funcs/fft_cache.h +++ b/paddle/phi/kernels/funcs/fft_cache.h @@ -25,6 +25,8 @@ #include "paddle/phi/kernels/funcs/cufft_util.h" #elif defined(PADDLE_WITH_HIP) #include "paddle/phi/kernels/funcs/hipfft_util.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/funcs/mufft_util.h" #endif namespace phi { diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 1d067b0fc2918..b7aa46dcb004e 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -1350,7 +1350,7 @@ __global__ void LayerNormBackwardComputeGradInput(const T *__restrict__ dout, // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize); sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize); -#else +#else // CUDA and MUSA // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += __shfl_xor_sync(0xffffffff, sum_loss1, mask, warpSize); sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, warpSize); @@ -1501,7 +1501,7 @@ __global__ void LayerNormBackwardComputeGradInputWithSmallFeatureSize( // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize); sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize); -#else +#else // CUDA and MUSA // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += 
__shfl_xor_sync(0xffffffff, sum_loss1, mask, WarpSize); sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, WarpSize); diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h index 1a6cca7f11aae..d9fb6de531557 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h +++ b/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -20,6 +20,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #include diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 2976968d07b70..b3684c4d4e0ba 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -23,6 +23,9 @@ #include namespace cub = hipcub; #endif +#ifdef __MCC__ +//TODO +#endif #include #include "paddle/phi/backends/gpu/gpu_launch_config.h" diff --git a/paddle/phi/kernels/funcs/softmax.cu b/paddle/phi/kernels/funcs/softmax.cu index 2ca97cd4ac205..55c24e8c980ff 100644 --- a/paddle/phi/kernels/funcs/softmax.cu +++ b/paddle/phi/kernels/funcs/softmax.cu @@ -60,6 +60,8 @@ void SoftmaxCUDNNFunctor::operator()( context.template Alloc(Y), MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_INSTANCE)); +#elif defined(PADDLE_WITH_MUSA) + // TODO #else cudnnTensorDescriptor_t cudnn_x_desc = xDesc.descriptor(layout, cudnn_tensor_dims); @@ -117,6 +119,8 @@ void SoftmaxGradCUDNNFunctor::operator()( context.template Alloc(XGrad), MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_INSTANCE)); +#elif defined(PADDLE_WITH_MUSA) + // TODO #else cudnnTensorDescriptor_t cudnn_y_desc = yDesc.descriptor(layout, cudnn_tensor_dims); diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas.h b/paddle/phi/kernels/funcs/sparse/sparse_blas.h index f6d67488d1f48..9a6534c32a1c6 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas.h @@ -100,3 +100,6 @@ inline SparseBlasT GetSparseBlas( #if defined(PADDLE_WITH_HIP) && HIP_VERSION >= 402 #include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h" #endif +#if defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.mu.h" +#endif diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu index 3cb1a6742543a..85dc7d31f2064 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu @@ -91,6 +91,10 @@ __global__ void VectorizedDropoutForward(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h index 418fa8bf55ce9..9c5e336a9f148 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h @@ -22,6 +22,10 @@ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/kernels/funcs/aligned_vector.h" @@ -29,7 +33,7 @@ #ifdef PADDLE_WITH_HIP #define WARP_SIZE 64 -#else +#else // MUSA & CUDA #define WARP_SIZE 32 #endif 
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index aa703ede3bad6..3eff633ff0c51 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -299,7 +299,7 @@ void HardSwishGradKernel(const Context& dev_ctx, } // namespace phi -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(relu_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 83e130f0a71bd..d741549bebcf9 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -186,7 +186,7 @@ PD_REGISTER_KERNEL(relu, float, double, phi::dtype::float16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(relu, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu index 99ccfcd8667e6..13a65c6a64f8b 100644 --- a/paddle/phi/kernels/gpu/allclose_kernel.cu +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -87,6 +87,8 @@ void AllCloseKernel(const Context& dev_ctx, grid = (grid > block) ? block : grid; #ifdef PADDLE_WITH_HIP hipMemset(out_data, true, sizeof(bool)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_data, true, sizeof(bool)); #else cudaMemset(out_data, true, sizeof(bool)); #endif diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index b940374556009..7546ebbaf736c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -568,7 +568,7 @@ void BatchNormGradRawKernel(const Context &ctx, scale.dims()[0])); auto dtype = phi::backends::gpu::CudnnDataType::type; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; @@ -650,6 +650,15 @@ void BatchNormGradRawKernel(const Context &ctx, // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t bn_param_desc_; + mudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -694,6 +703,15 @@ void BatchNormGradRawKernel(const Context &ctx, // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, // data_desc_, mode_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, @@ -1113,6 +1131,11 @@ void BatchNormGradRawKernel(const Context &ctx, // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_GPU_SUCCESS( @@ -1407,7 +1430,7 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(batch_norm_grad, GPU, ALL_LAYOUT, @@ -1445,7 +1468,7 @@ PD_REGISTER_KERNEL(batch_norm_double_grad, phi::BatchNormDoubleGradKernel, float, double) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(batch_norm_double_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 5c2d76be35992..2a4a435f9c96a 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -551,7 +551,7 @@ void BatchNormKernel(const Context &ctx, auto dtype = phi::backends::gpu::CudnnDataType::type; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; @@ -593,6 +593,15 @@ void BatchNormKernel(const Context &ctx, // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t bn_param_desc_; + mudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -641,7 +650,7 @@ void BatchNormKernel(const Context &ctx, strides = {H * W * D * C, 1, W * D * C, D * C, C}; } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(wangran16): wait for MIOpen to improve the performance of BN // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, @@ -942,7 +951,7 @@ void BatchNormKernel(const Context &ctx, // ctx.GetPlace())), // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); -#else +#else // CUDA & MUSA // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; const bool use_native_kernel = ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || @@ -1206,6 +1215,11 @@ void BatchNormKernel(const Context &ctx, // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(bn_param_desc_)); #else // 
clean when exit. PADDLE_ENFORCE_GPU_SUCCESS( @@ -1256,7 +1270,7 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(batch_norm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index 21dedeb94a62c..3a144b3ba7a40 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -289,7 +289,7 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, float, double, phi::dtype::float16) {} -#else +#else // CUDA & MUSA #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, GPU, diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index f8964f4ec5312..3bd4595c48b21 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -763,6 +763,8 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #endif @@ -782,12 +784,20 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, softmax_data, MIOPEN_SOFTMAX_LOG, mode)); +#else +#ifdef PADDLE_WITH_MUSA + auto mode = axis == rank - 1 ? MUDNN_SOFTMAX_MODE_INSTANCE + : MUDNN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSoftmaxForward( + handle, + MUDNN_SOFTMAX_LOG, #else auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, +#endif mode, phi::backends::gpu::CudnnDataType::kOne(), descp, @@ -1199,6 +1209,8 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #endif diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu index 661a1dd90e7e9..ff344fb47dcd6 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu @@ -195,7 +195,11 @@ void CudnnLSTMGradKernel( reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNBackwardData(handle, +#else phi::dynload::cudnnRNNBackwardData(handle, +#endif rnn.rnn_desc(), seq_length, rnn.y_descs(), @@ -223,7 +227,11 @@ void CudnnLSTMGradKernel( const_cast(reserve_data), reserve_size)); +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnRNNBackwardWeights( +#else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( +#endif handle, rnn.rnn_desc(), seq_length, @@ -305,7 +313,7 @@ void CudnnLSTMGradKernel( #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL( cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL( cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float, double) { } diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu index f3a03727e0bc4..bcc1f1464bed1 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu @@ -25,6 +25,9 @@ template #ifdef PADDLE_WITH_HIP void LSTMInferece(const bool &has_seq_length, const miopenHandle_t &handle, +#elif defined(PADDLE_WITH_MUSA) +void LSTMInferece(const bool &has_seq_length, + const mudnnHandle_t &handle, #else void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, @@ -64,6 +67,27 @@ void LSTMInferece(const bool &has_seq_length, last_c_data, workspace_data->data(), workspace_size)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnRNNForwardInference(handle, @@ -293,7 +338,11 @@ void CudnnLSTMKernel( reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNForwardTraining(handle, +#else
phi::dynload::cudnnRNNForwardTraining(handle, +#endif rnn.rnn_desc(), seq_length, rnn.x_descs(), @@ -366,7 +415,7 @@ PD_REGISTER_KERNEL(cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float) { kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL( cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float, double) { kernel->InputAt(5).SetDataType(phi::DataType::INT32); diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h index e5fc51849454d..033efe0b9e7b5 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h +++ b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h @@ -26,6 +26,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpu/miopen_lstm_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/kernels/gpu/mudnn_lstm_cache.h" +#endif namespace phi { diff --git a/paddle/phi/kernels/gpu/cum_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_grad_kernel.cu index 620d185475ef9..d92dab27c8c15 100644 --- a/paddle/phi/kernels/gpu/cum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_grad_kernel.cu @@ -63,7 +63,7 @@ void CumsumGradKernel(const Context& dev_ctx, } // namespace phi -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(cumsum_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/dirichlet_kernel.cu b/paddle/phi/kernels/gpu/dirichlet_kernel.cu index 09d6a402e701a..bed4d840062f7 100644 --- a/paddle/phi/kernels/gpu/dirichlet_kernel.cu +++ b/paddle/phi/kernels/gpu/dirichlet_kernel.cu @@ -25,6 +25,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #endif #ifdef PADDLE_WITH_HIP #include @@ -40,6 +42,11 @@ using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t; #define COMPAT_RAND_INIT hiprand_init #define COMPAT_RAND_UNIFORM hiprand_uniform #define COMPAT_RAND_NORMAL hiprand_normal +#elif defined(PADDLE_WITH_MUSA) +using COMPAT_RANDSTATEPHILOX4_32_10_T = murand_state_philox4x32_10_t; +#define COMPAT_RAND_INIT murand_init +#define COMPAT_RAND_UNIFORM murand_uniform +#define COMPAT_RAND_NORMAL murand_normal #endif namespace phi { diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 99ba12b1d6213..5fdf63083896e 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -99,6 +99,9 @@ struct EmbeddingGradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index ac0dea5165379..966d018feb97f 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -61,7 +61,7 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; @@ -76,6 +76,8 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, int* item_count_ptr = reinterpret_cast(item_count->ptr()); #ifdef 
PADDLE_WITH_HIP hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); #else cudaMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); #endif @@ -97,6 +99,11 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, item_count_ptr + num_input, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(&total_unique_items, + item_count_ptr + num_input, + sizeof(int), + musaMemcpyDeviceToHost); #else cudaMemcpy(&total_unique_items, item_count_ptr + num_input, @@ -131,7 +138,7 @@ void FillBufferHashTable(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; @@ -170,7 +177,7 @@ void ResetBufferHashTable(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (unique_items->size() + block - 1) / block; @@ -193,7 +200,7 @@ void ReindexSrc(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_edges + block - 1) / block; @@ -293,7 +300,7 @@ void BufferReindex(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_edges + block - 1) / block; @@ -364,6 +371,11 @@ void ReindexDst(const Context& dev_ctx, thrust::raw_pointer_cast(dst_ptr.data()) + node_len, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(&count_i, + thrust::raw_pointer_cast(dst_ptr.data()) + node_len, + sizeof(int), + musaMemcpyDeviceToHost); #else cudaMemcpy(&count_i, thrust::raw_pointer_cast(dst_ptr.data()) + node_len, diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index c01a8ea9d2e01..416352d5cb6ea 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -22,6 +22,9 @@ #ifdef PADDLE_WITH_HIP #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -82,6 +85,12 @@ __global__ void SampleKernel(const uint64_t rand_seed, threadIdx.y * CTA_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * CTA_SIZE + threadIdx.x, + 0, + &rng); #else curandStatePhilox4_32_10_t rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -118,6 +127,8 @@ __global__ void SampleKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif @@ -218,6 +229,10 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, hiprandState rng; hiprand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4_32_10 rng; + murand_init( + rand_seed * gridDim.x + blockIdx.x, 
threadIdx.y + threadIdx.x, 0, &rng); #else curandStatePhilox4_32_10_t rng; curand_init( @@ -242,6 +257,8 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, for (int idx = split + threadIdx.x; idx <= deg - 1; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index bff91078865d9..3c2e4fa856a6a 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -42,6 +42,15 @@ inline void CopyBCastOff(const BroadCastInfo& bcast_info, bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), + bcast_info.l_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + musaMemcpyHostToDevice); + musaMemcpy(thrust::raw_pointer_cast(r_bcastoff->data()), + bcast_info.r_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + musaMemcpyHostToDevice); #else cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), bcast_info.l_offset.data(), diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index ef39abd939410..5b0dda3030cf1 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -300,6 +300,9 @@ void GroupNormDirectCUDAFunctor::operator()( #ifdef PADDLE_WITH_HIP hipMemset(mean, 0, sizeof(AccT) * input_ddim[0] * groups); hipMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(mean, 0, sizeof(AccT) * input_ddim[0] * groups); + musaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); #else cudaMemset(mean, 0, sizeof(AccT) * input_ddim[0] * groups); cudaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 0f17a1bcc318a..14be4ee79d142 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -401,6 +401,14 @@ void InstanceNormGradKernel(const Context &dev_ctx, phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; @@ -427,6 +435,15 @@ void InstanceNormGradKernel(const Context &dev_ctx, const_cast(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, @@ -464,9 +481,14 @@ void InstanceNormGradKernel(const Context &dev_ctx, epsilon, saved_mean_data, saved_var_data)); +#else +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnBatchNormalizationBackward( + dev_ctx.mudnn_handle(), #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), +#endif CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), @@ -511,6 +533,11 @@ void InstanceNormGradKernel(const Context &dev_ctx, phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(in_param_desc_)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); @@ -659,7 +686,7 @@ PD_REGISTER_KERNEL(instance_norm_double_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(instance_norm_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index 7f10eac67c67c..51339ea33d36b 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -69,6 +69,14 @@ void InstanceNormKernel(const Context &dev_ctx, phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; @@ -100,6 +108,15 @@ void InstanceNormKernel(const Context &dev_ctx, const_cast(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, mudnnBNSpatial)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, @@ -198,7 +215,11 @@ void InstanceNormKernel(const Context &dev_ctx, phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnBatchNormalizationForwardTraining( +#else phi::dynload::cudnnBatchNormalizationForwardTraining( +#endif handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), @@ -217,11 +238,18 @@ void InstanceNormKernel(const Context &dev_ctx, saved_mean_data, saved_variance_data)); +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(in_param_desc_)); +#else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif +#endif } } // namespace phi @@ -243,7 +271,7 @@ PD_REGISTER_KERNEL(instance_norm, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(instance_norm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index e8fc640cdd508..e1c660e674427 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -137,7 +137,7 @@ PD_REGISTER_KERNEL(layer_norm_grad, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(layer_norm_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 34425d8cfcfe2..336a655d9c8fa 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -482,7 +482,7 @@ void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, template class LayerNormDirectCUDAFunctor; template class LayerNormDirectCUDAFunctor; -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA)) && !defined(PADDLE_WITH_HIP) template class LayerNormDirectCUDAFunctor; #endif @@ -689,7 +689,7 @@ PD_REGISTER_KERNEL(layer_norm, kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(layer_norm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index f6a5b26960a62..1b0bfaea403c3 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -48,6 +48,13 @@ PD_REGISTER_KERNEL(log_softmax_grad, float, phi::dtype::float16, phi::dtype::bfloat16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16) {} #else PD_REGISTER_KERNEL(log_softmax_grad, GPU, diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 6dfe3d2b6173d..7b6ffe2d0cfd4 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -47,6 +47,13 
@@ PD_REGISTER_KERNEL(log_softmax, float, phi::dtype::float16, phi::dtype::bfloat16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16) {} #else PD_REGISTER_KERNEL(log_softmax, GPU, diff --git a/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu b/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu index 4f4ee36892d62..f02f47edc4e28 100644 --- a/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/logcumsumexp_grad_impl.h" -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(logcumsumexp_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/logsumexp_function.cu.h b/paddle/phi/kernels/gpu/logsumexp_function.cu.h index 53b6fb6d2b20d..76d0b294f397b 100644 --- a/paddle/phi/kernels/gpu/logsumexp_function.cu.h +++ b/paddle/phi/kernels/gpu/logsumexp_function.cu.h @@ -46,7 +46,7 @@ __inline__ __device__ T WarpAllReduce(T val) { for (int mask = ThreadGroupWidth / 2; mask > 0; mask /= 2) { #if PADDLE_WITH_HIP val = Functor()(val, __shfl_xor(0xffffffff, val, mask)); -#else +#else // CUDA & MUSA val = Functor()(val, __shfl_xor_sync(0xffffffff, val, mask)); #endif } @@ -69,6 +69,22 @@ inline void GetNumBlocks(int64_t block_size, *num_blocks = std::max( 1, std::min(max_blocks, sm_count * tpm / block_size * waves)); } +#elif defined(PADDLE_WITH_MUSA) +inline void GetNumBlocks(int64_t block_size, + int64_t max_blocks, + int64_t waves, + int* num_blocks) { + int dev; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&dev)); + int sm_count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&sm_count, musaDevAttrMultiProcessorCount, dev)); + int tpm; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &tpm, musaDevAttrMaxThreadsPerMultiProcessor, dev)); + *num_blocks = std::max( + 1, std::min(max_blocks, sm_count * tpm / block_size * waves)); +} #else inline void GetNumBlocks(int64_t block_size, int64_t max_blocks, @@ -193,6 +209,12 @@ inline hipError_t LaunchLogsumexpWarp(const Context& dev_ctx, const int64_t num_col, const SourceType* in, SourceType* out) { +#elif defined(PADDLE_WITH_MUSA) +inline musaError_t LaunchLogsumexpWarp(const Context& dev_ctx, + const int64_t num_row, + const int64_t num_col, + const SourceType* in, + SourceType* out) { #else inline cudaError_t LaunchLogsumexpWarp(const Context& dev_ctx, const int64_t num_row, @@ -222,6 +244,8 @@ inline cudaError_t LaunchLogsumexpWarp(const Context& dev_ctx, dev_ctx, num_row, num_col, in, out); #if PADDLE_WITH_HIP return hipPeekAtLastError(); +#elif defined(PADDLE_WITH_MUSA) + return musaPeekAtLastError(); #else return cudaPeekAtLastError(); #endif @@ -240,6 +264,12 @@ inline hipError_t DispatchLogsumexpWarpWithPadding(const Context& dev_ctx, const int64_t num_col, const SourceType* in, SourceType* out) { +#elif defined(PADDLE_WITH_MUSA) +inline musaError_t DispatchLogsumexpWarpWithPadding(const Context& dev_ctx, + const int64_t num_row, + const int64_t num_col, + const SourceType* in, + SourceType* out) { #else inline cudaError_t DispatchLogsumexpWarpWithPadding(const Context& dev_ctx, const int64_t num_row, @@ -287,6 +317,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, if (num_col <= 0) { #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; 
#endif @@ -367,6 +399,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, #undef HANDLE_COL #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; #endif @@ -391,6 +425,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, if (num_col <= 0) { #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; #endif @@ -455,6 +491,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, #undef HANDLE_COL #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; #endif diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu index 7895983236f91..4e5a2942d6b3b 100644 --- a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu @@ -36,6 +36,8 @@ void NllLossGradKernel(const Context& dev_ctx, auto total_weight_data = total_weight.data(); #ifdef PADDLE_WITH_HIP hipMemset(dx_data, 0, dx->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(dx_data, 0, dx->numel() * sizeof(T)); #else cudaMemset(dx_data, 0, dx->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu index 1e80eb9bb460e..5d9aec594089d 100644 --- a/paddle/phi/kernels/gpu/nll_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu @@ -37,6 +37,8 @@ void NllLossRawKernel(const Context& dev_ctx, auto weight_data = weight.get_ptr() ? weight.get_ptr()->data() : nullptr; #ifdef PADDLE_WITH_HIP hipMemset(total_weight_data, 0, sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(total_weight_data, 0, sizeof(T)); #else cudaMemset(total_weight_data, 0, sizeof(T)); #endif diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index fc27258981d39..e351c29138ee8 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -25,6 +25,10 @@ namespace phi { using gpuRNNMode_t = miopenRNNMode_t; using gpuDnnHandle_t = miopenHandle_t; using gpuDnnDataType_t = miopenDataType_t; +#elif defined(PADDLE_WITH_MUSA) +using gpuRNNMode_t = mudnnRNNMode_t; +using gpuDnnHandle_t = mudnnHandle_t; +using gpuDnnDataType_t = mudnnDataType_t; #else using gpuRNNMode_t = cudnnRNNMode_t; using gpuDnnHandle_t = cudnnHandle_t; @@ -103,6 +107,9 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDropoutGetStatesSize(handle, &state_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); @@ -143,8 +150,12 @@ class RNNDescriptors { mode_, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); +#else +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetRNNDescriptor( #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor( +#endif rnn_desc_.desc(), hidden_size_, num_layers_, @@ -167,6 +178,9 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnGetRNNParamsSize( + handle, rnn_desc_.desc(), x_descs_[0], 
&weights_size_, cudnn_type)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); @@ -191,6 +205,15 @@ class RNNDescriptors { workspace_size)); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnGetRNNWorkspaceSize(handle, + rnn_desc_.desc(), + seq_length_, + x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnGetRNNTrainingReserveSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnGetRNNWorkspaceSize(handle, @@ -212,6 +235,16 @@ class RNNDescriptors { miopenRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } miopenDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } miopenTensorDescriptor_t weight_desc() { return weight_desc_.desc(); } +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } + mudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } + mudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); } + mudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); } + mudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); } + mudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); } + mudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } + mudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } + mudnnTensorDescriptor_t weight_desc() { return weight_desc_.desc(); } #else cudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } cudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } @@ -243,6 +276,9 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP std::vector x_descs_; std::vector y_descs_; +#elif defined(PADDLE_WITH_MUSA) + std::vector x_descs_; + std::vector y_descs_; #else std::vector x_descs_; std::vector y_descs_; diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index 910c8e8b6a57a..44bca2124770a 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -105,6 +105,16 @@ void RnnGradKernel(const Context &dev_ctx, rnn_mode = miopenRNNRELU; else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; +#elif defined(PADDLE_WITH_MUSA) + mudnnRNNMode_t rnn_mode = MUDNN_LSTM; + if (mode == "LSTM") + rnn_mode = MUDNN_LSTM; + else if (mode == "GRU") + rnn_mode = MUDNN_GRU; + else if (mode == "RNN_RELU") + rnn_mode = MUDNN_RNN_RELU; + else if (mode == "RNN_TANH") + rnn_mode = MUDNN_RNN_TANH; #else cudnnRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") @@ -195,6 +205,8 @@ void RnnGradKernel(const Context &dev_ctx, T *init_c_grad_data = nullptr; #ifdef PADDLE_WITH_HIP if (rnn_mode == miopenLSTM) { +#elif defined(PADDLE_WITH_MUSA) + if (rnn_mode == MUDNN_LSTM) { #else if (rnn_mode == CUDNN_LSTM) { #endif @@ -341,8 +353,12 @@ void RnnGradKernel(const Context &dev_ctx, // permute weight grad list from weight grad tensor TensorToPermutedWeight( place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); +#else +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnRNNBackwardWeights( #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( +#endif handle, rnn.rnn_desc(), seq_length, diff --git 
a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index c1ed3f16e0584..601c1a524c402 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -65,7 +65,11 @@ void RNNInferece(bool has_seq_length, workspace_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNForwardInference(handle, +#else phi::dynload::cudnnRNNForwardInference(handle, +#endif rnn->rnn_desc(), seq_length, rnn->x_descs(), @@ -154,6 +158,16 @@ void RnnKernel(const Context &dev_ctx, rnn_mode = miopenRNNRELU; else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; +#elif defined(PADDLE_WITH_MUSA) + gpuRNNMode_t rnn_mode = MUDNN_LSTM; + if (mode == "LSTM") + rnn_mode = MUDNN_LSTM; + else if (mode == "GRU") + rnn_mode = MUDNN_GRU; + else if (mode == "RNN_RELU") + rnn_mode = MUDNN_RNN_RELU; + else if (mode == "RNN_TANH") + rnn_mode = MUDNN_RNN_TANH; #else gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") @@ -188,6 +202,8 @@ void RnnKernel(const Context &dev_ctx, T *last_c_data = nullptr; #ifdef PADDLE_WITH_HIP if (rnn_mode == miopenLSTM) { +#elif defined(PADDLE_WITH_MUSA) + if (rnn_mode == MUDNN_LSTM) { #else if (rnn_mode == CUDNN_LSTM) { #endif @@ -333,7 +349,11 @@ void RnnKernel(const Context &dev_ctx, reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNForwardTraining(handle, +#else phi::dynload::cudnnRNNForwardTraining(handle, +#endif rnn.rnn_desc(), seq_length, rnn.x_descs(), @@ -405,7 +425,7 @@ void RnnKernel(const Context &dev_ctx, PD_REGISTER_KERNEL(rnn, GPU, ALL_LAYOUT, phi::RnnKernel, float) { kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(rnn, GPU, ALL_LAYOUT, phi::RnnKernel, float, double) { kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } diff --git a/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu index a7e4e32ed1d17..58cf7a273f540 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu @@ -49,6 +49,8 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( #ifdef PADDLE_WITH_HIP hipMemset(p_output, 0, memset_bytes); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(p_output, 0, memset_bytes); #else cudaMemset(p_output, 0, memset_bytes); #endif diff --git a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu index 85cc80e36b517..3aa20279bdd29 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu @@ -63,6 +63,8 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, if (reduce_op == "SUM" || reduce_op == "MEAN") { #ifdef PADDLE_WITH_HIP hipMemset(p_output, 0, memset_bytes); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(p_output, 0, memset_bytes); #else cudaMemset(p_output, 0, memset_bytes); #endif @@ -138,6 +140,8 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, #ifdef PADDLE_WITH_HIP hipMemset(p_dst_count, 0, input_size * sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(p_dst_count, 0, input_size * sizeof(int)); #else cudaMemset(p_dst_count, 0, input_size * sizeof(int)); #endif diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index d368c43a29753..a1c2a0dcf2214 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ 
-166,6 +166,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -243,6 +248,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -289,6 +299,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -358,6 +373,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -493,6 +513,9 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( #ifdef PADDLE_WITH_HIP hipMemset(x_grad_data, 0, memset_bytes_x); hipMemset(e_grad_data, 0, memset_bytes_e); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(x_grad_data, 0, memset_bytes_x); + musaMemset(e_grad_data, 0, memset_bytes_e); #else cudaMemset(x_grad_data, 0, memset_bytes_x); cudaMemset(e_grad_data, 0, memset_bytes_e); diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 764490bd1cb8b..33f7cbccd0f5e 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -61,6 +61,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, if (reduce_op == "SUM" || reduce_op == "MEAN") { #ifdef PADDLE_WITH_HIP hipMemset(out_data, 0, memset_bytes); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_data, 0, memset_bytes); #else cudaMemset(out_data, 0, memset_bytes); #endif @@ -104,7 +106,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, #ifdef PADDLE_WITH_HIP int block_ = 256; #else - int block_ = 1024; + int block_ = 1024; // CUDA & MUSA #endif if (reduce_op == "SUM" || reduce_op == "MEAN") { GraphSendUERecvSumCUDAFunctor sum_functor; @@ -158,6 +160,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, int* dst_count_data = dst_count->data(); #ifdef PADDLE_WITH_HIP hipMemset(dst_count_data, 0, input_size * sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(dst_count_data, 0, input_size * sizeof(int)); #else cudaMemset(dst_count_data, 0, input_size * sizeof(int)); #endif diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index c50b1960d0056..408f4bf26593c 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -116,6 +116,11 @@ void CalculateGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -198,6 +203,11 @@ void CalculateGrad(const Context& ctx, x_grad_out.data(), 
x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -247,6 +257,9 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, #ifdef PADDLE_WITH_HIP hipMemset(x_grad_data, 0, memset_bytes_x); hipMemset(y_grad_data, 0, memset_bytes_y); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(x_grad_data, 0, memset_bytes_x); + musaMemset(y_grad_data, 0, memset_bytes_y); #else cudaMemset(x_grad_data, 0, memset_bytes_x); cudaMemset(y_grad_data, 0, memset_bytes_y); diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index bef328ec21a20..0bbbd079f9738 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -198,7 +198,7 @@ void TopkKernel(const Context& dev_ctx, gridx, input_height, largest)); -#else +#else // CUDA & MUSA FIXED_BLOCK_DIM(switch (phi::funcs::getMaxLength(k)) { FIXED_MAXLENGTH( phi::funcs::KeMatrixTopK @@ -307,7 +307,7 @@ void TopkKernel(const Context& dev_ctx, gridx, input_height, largest)); -#else +#else // CUDA & MUSA FIXED_BLOCK_DIM(switch (phi::funcs::getMaxLength(k)) { FIXED_MAXLENGTH(phi::funcs::KeMatrixTopK <<>>( diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 2a3c9515ac2ea..255948e1f6570 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #include "paddle/phi/kernels/affine_grid_grad_kernel.h" #include "paddle/phi/backends/all_context.h" diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 2c6e898fa25c8..1b01f2b8131c9 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -22,6 +22,8 @@ #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/gpudnn/conv_mudnn_helper.h" #else #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif @@ -138,6 +140,9 @@ void ConvCudnnGradKernelImplV7( #ifdef PADDLE_WITH_HIP SearchResult bwd_result; SearchResult filter_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult bwd_result; + SearchResult filter_result; #else SearchResult bwd_result; SearchResult filter_result; @@ -146,7 +151,7 @@ void ConvCudnnGradKernelImplV7( int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_groups = 1; c_groups = groups; groups = 1; @@ -172,7 +177,7 @@ void ConvCudnnGradKernelImplV7( workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search1 = SearchAlgorithm; bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); workspace_size = std::max(workspace_size, bwd_result.workspace_size); @@ -198,7 +203,7 @@ void ConvCudnnGradKernelImplV7( workspace_size = 
std::max(workspace_size, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, exhaustive_search, deterministic); @@ -213,7 +218,7 @@ void ConvCudnnGradKernelImplV7( #ifdef PADDLE_WITH_HIP // MIOPEN ONLY support beta to be 0.0f ScalingParamType beta = 0.0f; -#else +#else // CUDA & MUSA ScalingParamType beta = use_addto ? 1.0f : 0.0f; #endif @@ -278,7 +283,7 @@ void ConvCudnnGradKernelImplV7( }, workspace_size); } -#else +#else // CUDA & MUSA ConvRunner::Apply(ctx, args1, bwd_result, @@ -318,7 +323,7 @@ void ConvCudnnGradKernelImplV7( workspace_size)); }, workspace_size); -#else +#else // MUSA & CUDA ConvRunner::Apply(ctx, args2, filter_result, @@ -455,7 +460,7 @@ void ConvCudnnGradKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = phi::backends::gpu::DataLayout::kNCHW; -#else +#else // MUSA & CUDA #if CUDNN_VERSION_MIN(8, 1, 0) const bool compute_in_nhwc = (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) && @@ -1004,7 +1009,7 @@ void ConvCudnnGradGradKernel( int iwo_group = groups; int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_group = 1; c_group = groups; groups = 1; @@ -1061,6 +1066,11 @@ void ConvCudnnGradGradKernel( SearchResult fwd_result2; SearchResult data_result; SearchResult filter_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult fwd_result1; + SearchResult fwd_result2; + SearchResult data_result; + SearchResult filter_result; #else SearchResult fwd_result1; SearchResult fwd_result2; @@ -1091,7 +1101,7 @@ void ConvCudnnGradGradKernel( workspace_size = search1::GetWorkspaceSize(args1); fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); -#else +#else // CUDA & MUSA using search1 = SearchAlgorithm; fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); @@ -1116,7 +1126,7 @@ void ConvCudnnGradGradKernel( std::max(workspace_size, search2::GetWorkspaceSize(args2)); fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( @@ -1142,7 +1152,7 @@ void ConvCudnnGradGradKernel( workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, exhaustive_search, deterministic); @@ -1169,7 +1179,7 @@ void ConvCudnnGradGradKernel( workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search4 = SearchAlgorithm; data_result = search4::Find(ctx, args4, exhaustive_search, deterministic); @@ -1226,7 +1236,7 @@ void ConvCudnnGradGradKernel( workspace_size)); }, workspace_size); -#else +#else // MUSA & CUDA ConvRunner::Apply(ctx, args1, fwd_result1, @@ -1345,7 +1355,7 @@ void ConvCudnnGradGradKernel( workspace_size)); }, workspace_size); -#else +#else // CUDA & 
MUSA ConvRunner::Apply(ctx, args4, data_result, @@ -1540,7 +1550,7 @@ PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 15161dd61c697..e73ce989f0306 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -23,6 +23,8 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/gpudnn/conv_mudnn_helper.h" #else #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif @@ -84,7 +86,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, // MIOPEN need to set groups in cdesc in miopen_desc.h args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); -#else +#else // CUDA & MUSA args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif @@ -151,6 +153,11 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, workspace_size = search::GetWorkspaceSize(args); fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); +#elif defined(PADDLE_WITH_MUSA) + SearchResult fwd_result; + using search = SearchAlgorithm; + fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); + workspace_size = fwd_result.workspace_size; #else SearchResult fwd_result; using search = SearchAlgorithm; @@ -195,7 +202,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, workspace_size)); }, workspace_size); -#else +#else // CUDA & MUSA ConvRunner::Apply(ctx, args, fwd_result, @@ -363,7 +370,7 @@ void ConvCudnnKernel(const Context& ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); auto dtype = phi::backends::gpu::CudnnDataType::type; -#ifdef PADDLE_WITH_HIP +#ifd defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = phi::backends::gpu::DataLayout::kNCHW; #else @@ -651,7 +658,7 @@ PD_REGISTER_KERNEL(conv3d, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 50bae0a8bca3e..f30361864dbb0 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -32,6 +32,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" @@ -167,7 +170,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_groups = 1; c_groups = groups; groups = 1; @@ -200,6 +203,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP SearchResult fwd_result; SearchResult filter_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult fwd_result; + SearchResult filter_result; #else SearchResult fwd_result; SearchResult filter_result; @@ -228,7 +234,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); -#else +#else // MUSA & CUDA using search1 = SearchAlgorithm; fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( @@ -253,7 +259,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( @@ -292,7 +298,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, }; workspace_handle.RunFunc(cudnn_func, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args1, fwd_result, @@ -349,7 +355,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, }; workspace_handle.RunFunc(cudnn_func, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args2, filter_result, @@ -363,7 +369,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif } } @@ -613,7 +619,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( int iwo_group = groups; int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_group = 1; c_group = groups; groups = 1; @@ -670,6 +676,11 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( SearchResult bwd_result2; SearchResult filter_result; SearchResult fwd_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; #else SearchResult bwd_result1; SearchResult bwd_result2; @@ -700,7 +711,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size = search1::GetWorkspaceSize(args1); bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search1 = SearchAlgorithm; bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); @@ -722,7 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( 
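// A sketch of the group-count normalization guarded above: when the DNN
// library handles grouped convolution natively (cuDNN >= 7.0.1, MIOpen and,
// per this patch, the MUSA backend), per-group looping is disabled and the
// group count is folded into the convolution descriptor instead. Names here
// are illustrative.
struct GroupConfigSketch {
  int iwo_groups;   // groups applied to the input/weight/output descriptors
  int c_groups;     // groups applied to the convolution descriptor
  int loop_groups;  // how many times the caller still loops explicitly
};

inline GroupConfigSketch NormalizeGroupsSketch(int groups, bool native_group_support) {
  if (native_group_support) {
    return {1, groups, 1};  // descriptors see one group; the library splits internally
  }
  return {groups, 1, groups};  // legacy path: slice tensors and loop per group
}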
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( @@ -747,7 +758,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( @@ -773,7 +784,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search4 = SearchAlgorithm; fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( @@ -833,7 +844,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args1, bwd_result1, @@ -847,7 +858,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif #ifdef PADDLE_WITH_HIP for (int i = 0; i < groups; i++) { @@ -886,7 +897,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out)); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args2, bwd_result2, @@ -900,7 +911,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, true); -#endif // PADDLE_WITH_HIP +#endif if ((!is_sys_pad) && (!channel_last)) { if (strides.size() == 2U) { @@ -956,7 +967,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); } -#else // PADDLE_WITH_HIP +#else // MUSA & CUDA ConvRunner::Apply(ctx, args3, filter_result, @@ -970,7 +981,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif } if (dx) { @@ -996,7 +1007,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); } -#else // PADDLE_WITH_HIP +#else // MUSA & CUDA ConvRunner::Apply(ctx, args4, fwd_result, @@ -1010,7 +1021,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif if (channel_last) { TransToChannelLast(ctx, &transformed_dx_channel, dx); @@ -1097,7 +1108,7 @@ PD_REGISTER_KERNEL(conv3d_transpose_grad, double, float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d_transpose_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index df360ab388a6d..ed64723a40e4f 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -30,6 +30,9 @@ limitations under the License. 
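// A sketch of the per-group pointer arithmetic the explicit MIOpen-style loop
// above relies on: each group reads and writes one contiguous channel slice.
// Offsets assume NCHW layout; all names are stand-ins.
#include <cstddef>

struct GroupOffsetsSketch {
  size_t in;      // elements skipped per group in the input tensor
  size_t out;     // elements skipped per group in the output tensor
  size_t filter;  // elements skipped per group in the filter tensor
};

inline GroupOffsetsSketch MakeGroupOffsetsSketch(size_t ic, size_t oc,
                                                 size_t ih, size_t iw,
                                                 size_t oh, size_t ow,
                                                 size_t filter_numel,
                                                 size_t groups) {
  return {ic / groups * ih * iw,   // input slice per group
          oc / groups * oh * ow,   // output slice per group
          filter_numel / groups};  // filter slice per group
}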
*/ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/rocm/mudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_mudnn_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" @@ -176,7 +179,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WTIH_MUSA) iwo_groups = 1; c_groups = groups; groups = 1; @@ -191,6 +194,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, size_t workspace_size = 0; #ifdef PADDLE_WITH_HIP miopenConvBwdDataAlgorithm_t algo{}; +#elif defined(PADDLE_WITH_MUSA) + mudnnConvBwdDataAlgorithm_t algo{}; #else cudnnConvolutionBwdDataAlgo_t algo{}; #endif @@ -227,6 +232,12 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); bwd_result.algo = search::Find(args, false, deterministic, workspace_size, ctx); +#elif defined(PADDLE_WITH_MUSA) + SearchResult bwd_result; + using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + bwd_result.algo = + search::Find(args, false, deterministic, workspace_size, ctx); #else SearchResult bwd_result; using search = SearchAlgorithm; @@ -262,7 +273,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, }; workspace_handle.RunFunc(cudnn_func, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args, bwd_result, @@ -276,7 +287,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif if (!is_sys_pad && strides.size() == 2U) { funcs::Slice(ctx, &transformed_out, out, starts, ends, axes); @@ -385,7 +396,7 @@ PD_REGISTER_KERNEL(conv3d_transpose, double, float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d_transpose, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu index 1161040f2163f..a52e0e37d0e71 100644 --- a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -154,7 +154,7 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, // input grad transformed_input_grad.Resize(make_ddim(in_dims_vec)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // MIOPEN not support NHWC data layout } else if (data_format == str_NHWC) { layout = GPUDNNDataLayout::kNCHW; @@ -217,6 +217,11 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, layout, vectorize(transformed_input.dims())); miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, vectorize(transformed_output.dims())); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + mudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); #else cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, vectorize(transformed_input.dims())); @@ -238,6 +243,9 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP miopenPoolingDescriptor_t cudnn_pool_desc = 
pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#elif defined(PADDLE_WITH_MUSA) + mudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); #else cudnnPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); @@ -269,6 +277,17 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, input_grad_data, pool_workspace)); PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(dynload::mudnnPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, #else PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle, cudnn_pool_desc, @@ -289,7 +308,7 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, funcs::Transpose trans5_v4; trans5_v4(ctx, transformed_input_grad, input_grad, axis); } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // MIOPEN not support NHWC data layout if (data_format == str_NHWC) { std::vector axis{0, 2, 3, 1}; @@ -424,7 +443,7 @@ PD_REGISTER_KERNEL(pool3d_grad, phi::Pool3dGradGPUDNNKernel, float, float16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(pool2d_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu index b1a79dd874068..8a6ceb29690d2 100644 --- a/paddle/phi/kernels/gpudnn/pool_kernel.cu +++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu @@ -111,8 +111,8 @@ void PoolRawGPUDNNKernel(const Context& ctx, out_dims_vec[3] = output->dims()[2]; out_dims_vec[4] = output->dims()[3]; transformed_output.Resize(make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) + // MIOPEN and MUDNN not support NHWC data layout } else if (data_format == str_NHWC) { layout = GPUDNNDataLayout::kNCHW; @@ -155,6 +155,11 @@ void PoolRawGPUDNNKernel(const Context& ctx, layout, vectorize(transformed_input.dims())); miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, vectorize(transformed_output.dims())); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + mudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); #else cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, vectorize(transformed_input.dims())); @@ -172,6 +177,9 @@ void PoolRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP miopenPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#elif defined(PADDLE_WITH_MUSA) + mudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); #else cudnnPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); @@ -200,6 +208,16 @@ void PoolRawGPUDNNKernel(const Context& ctx, pool_workspace, pool_workernel_size_)); PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::mudnnPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data)); #else PADDLE_ENFORCE_GPU_SUCCESS( 
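// A sketch of the NHWC fallback guarded above: MIOpen and, per this patch,
// the MUSA DNN path pool only in NCHW, so NHWC tensors are transposed to
// NCHW before the library call and the result is transposed back afterwards.
// Axis orders below are for 4-D tensors; names are illustrative.
#include <vector>

struct NhwcFallbackAxesSketch {
  std::vector<int> to_nchw{0, 3, 1, 2};  // NHWC -> NCHW before pooling
  std::vector<int> to_nhwc{0, 2, 3, 1};  // NCHW -> NHWC after pooling
};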
dynload::cudnnPoolingForward(handle, @@ -217,7 +235,7 @@ void PoolRawGPUDNNKernel(const Context& ctx, funcs::Transpose trans5_v2; trans5_v2(ctx, transformed_output, output, axis); } -#ifdef PADDLE_WITH_HIP +#elif defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // MIOPEN not support NHWC data layout if (data_format == str_NHWC) { std::vector axis{0, 2, 3, 1}; @@ -295,7 +313,7 @@ PD_REGISTER_KERNEL( pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {} PD_REGISTER_KERNEL( pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(pool2d, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu index 72a5f37d14005..93dff54fa128c 100644 --- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu @@ -59,7 +59,7 @@ PD_REGISTER_KERNEL(softmax_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(softmax_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index c918eeec83121..acf2fd4808814 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -19,6 +19,8 @@ #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +// TODO #else #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h index de59cb0c32ca1..f74094184e33f 100644 --- a/paddle/phi/kernels/impl/isclose_kernel_impl.h +++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h @@ -145,6 +145,8 @@ struct IscloseFunctor { grid = (grid > block) ? 
block : grid; #ifdef PADDLE_WITH_HIP hipMemset(out_data, true, num * sizeof(bool)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_data, true, num * sizeof(bool)); #else cudaMemset(out_data, true, num * sizeof(bool)); #endif diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h index 82b99b07a8927..f5a0998505dce 100644 --- a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h +++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h @@ -77,6 +77,11 @@ void SegmentKernelLaunchHelper(const Context& dev_ctx, segment_ids_ptr + num_indices - 1, sizeof(IndexT), hipMemcpyDeviceToHost)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(length_data, + segment_ids_ptr + num_indices - 1, + sizeof(IndexT), + musaMemcpyDeviceToHost)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(length_data, segment_ids_ptr + num_indices - 1, diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2a3579d99cfe6..8778dc144e503 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -20,6 +20,10 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/core/ddim.h" namespace phi { diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index ff50e9d1077b0..c5219c32cb743 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -57,6 +57,11 @@ PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} #endif +#if defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL() + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t, phi::dtype::float16) {} +#endif + #if defined(PADDLE_WITH_XPU_KP) && !defined(PADDLE_WITH_XPU) PD_REGISTER_KERNEL(min, KPS, ALL_LAYOUT, phi::MinKernel, float) {} #endif diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc index ff3173ec0a101..4bd01b667516b 100644 --- a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(batch_norm_coo_grad, } #endif -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_coo_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/batch_norm_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_kernel.cc index 04ab36892513c..5ea531bbab1c4 100644 --- a/paddle/phi/kernels/sparse/batch_norm_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_kernel.cc @@ -92,7 +92,7 @@ PD_REGISTER_KERNEL(batch_norm_coo, } #endif -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_coo, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index aaed804c92657..3366a86850bd2 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -89,6 +89,8 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, // 3. 
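// The MUSA registration added above mirrors the existing CUDA/HIP blocks;
// written out in the usual shape it reads as follows (guard and dtype list
// taken from the hunk, surrounding includes assumed from the same file).
#if defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(min,
                   GPU,
                   ALL_LAYOUT,
                   phi::MinKernel,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16) {}
#endif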
sort (indices, values index) #ifdef PADDLE_WITH_HIP thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::sort_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -100,6 +102,8 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, thrust::pair new_end = #ifdef PADDLE_WITH_HIP thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::unique_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 689629c939338..68ca818bad303 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -603,6 +603,8 @@ inline void CallThrustScan(const GPUContext& dev_ctx, int* h_offsets_ptr) { #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif definfed(PADDLE_WITH_MUSA) + thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -836,6 +838,8 @@ int ProductRuleBook(const Context& dev_ctx, // 2. remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + IntT* last = thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -884,6 +888,8 @@ int ProductRuleBook(const Context& dev_ctx, index_flags_ptr, index_flags.numel(), out_index_table_ptr); #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a7dcb6d514830..3bc6cd3b9ab92 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -78,6 +78,8 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, sizeof(IntT) * len, #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToDevice, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToDevice, #else cudaMemcpyDeviceToDevice, #endif @@ -86,6 +88,8 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, // performance, but thrust::merge_by_key limited by data size #ifdef PADDLE_WITH_HIP thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::sort_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -97,6 +101,8 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, thrust::pair new_end = #ifdef PADDLE_WITH_HIP thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::unique_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -348,6 +354,8 @@ int ProductRuleBook(const Context& dev_ctx, // 2. 
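// A minimal sketch of the execution-policy dispatch repeated in the thrust
// calls above: each backend exposes a parallel policy bound to the current
// stream, and everything after the policy argument stays identical. The
// helper macro is illustrative; thrust::musa::par is used as in this patch.
#include <thrust/execution_policy.h>
#if defined(PADDLE_WITH_HIP)
#define SKETCH_THRUST_PAR(stream) thrust::hip::par.on(stream)
#elif defined(PADDLE_WITH_MUSA)
#define SKETCH_THRUST_PAR(stream) thrust::musa::par.on(stream)
#else
#define SKETCH_THRUST_PAR(stream) thrust::cuda::par.on(stream)
#endif
// Usage: thrust::sort_by_key(SKETCH_THRUST_PAR(dev_ctx.stream()),
//                            keys, keys + n, values);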
remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + IntT* last = thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -364,6 +372,8 @@ int ProductRuleBook(const Context& dev_ctx, sizeof(IntT), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -388,6 +398,8 @@ int ProductRuleBook(const Context& dev_ctx, IntT* bound_ptr = bound.data(); #ifdef PADDLE_WITH_HIP thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::lower_bound(thrust::musa::par.on(dev_ctx.stream()), #else thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -415,6 +427,8 @@ int ProductRuleBook(const Context& dev_ctx, // remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + IntT* last = thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -428,6 +442,8 @@ int ProductRuleBook(const Context& dev_ctx, sizeof(IntT), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -438,6 +454,8 @@ int ProductRuleBook(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -450,6 +468,8 @@ int ProductRuleBook(const Context& dev_ctx, kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -460,6 +480,8 @@ int ProductRuleBook(const Context& dev_ctx, kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -501,6 +523,13 @@ int ProductRuleBook(const Context& dev_ctx, sizeof(IntT), hipMemcpyDeviceToHost, dev_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(IntT), + musaMemcpyDeviceToHost, + dev_ctx.stream()); #else phi::backends::gpu::GpuMemcpyAsync( &out_non_zero_num, diff --git a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu index 47daa1eae19ed..b9b340da8caee 100644 --- a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu @@ -43,6 +43,8 @@ void ElementWiseAddCooGPUKernel(const GPUContext& dev_ctx, const IntT* y_indices_ptr = y_indices.data(); #ifdef PADDLE_WITH_HIP bool is_same = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + bool is_same = thrust::equal(thrust::musa::par.on(dev_ctx.stream()), #else bool is_same = thrust::equal(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 7dbdbe2acc992..fc526adeacec5 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu 
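// A sketch of the host read-back pattern used above for rulebook sizes: a
// scalar at the end of a device buffer is copied back asynchronously on the
// compute stream, and the stream is waited on before the host value is used.
// Helper names are stand-ins; the memcpy-kind constant follows the backend.
#include <cstddef>
#include "paddle/phi/backends/gpu/gpu_info.h"  // phi::backends::gpu::GpuMemcpyAsync
#if defined(PADDLE_WITH_HIP)
#define SKETCH_MEMCPY_D2H hipMemcpyDeviceToHost
#elif defined(PADDLE_WITH_MUSA)
#define SKETCH_MEMCPY_D2H musaMemcpyDeviceToHost
#else
#define SKETCH_MEMCPY_D2H cudaMemcpyDeviceToHost
#endif

template <typename IntT, typename Context>
IntT ReadLastElementSketch(const Context& dev_ctx, const IntT* device_buf, size_t len) {
  IntT value = 0;
  phi::backends::gpu::GpuMemcpyAsync(
      &value, device_buf + len - 1, sizeof(IntT), SKETCH_MEMCPY_D2H, dev_ctx.stream());
  dev_ctx.Wait();  // the async copy must finish before `value` is valid on the host
  return value;
}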
@@ -68,7 +68,7 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx, set_zero(dev_ctx, dy, static_cast(0.0f)); sparse_blas.SPMM( true, false, static_cast(1), x_csr, dout, static_cast(0), dy); -#elif defined(PADDLE_WITH_CUDA) +#elif defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) sparse_blas.SPMM( true, false, static_cast(1), x, dout, static_cast(0), dy); #endif @@ -84,6 +84,10 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx, "rocsparse_sddmm with transpose, which is " "supported from " "ROCM 4.3.0")); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from " + "MUSA xxx")); #endif #endif } @@ -135,6 +139,10 @@ void MatmulCsrDenseGradKernel(const Context& dev_ctx, "rocsparse_sddmm with transpose, which is " "supported from " "ROCM 4.3.0")); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from " + "MUSA xxx")); #endif #endif } diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 3f0ec2c2713e5..913581710dc3f 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -103,6 +103,8 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx, // 2. max pool #ifdef PADDLE_WITH_HIP thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::fill(thrust::musa::par.on(dev_ctx.stream()), #else thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu index f47accfc8eff8..c998de2df3e46 100644 --- a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu @@ -178,6 +178,8 @@ void SliceCooGPUCompute(const Context& dev_ctx, d_out_nnz_indices.Resize({out_nnz}); #ifdef PADDLE_WITH_HIP thrust::sort(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::sort(thrust::musa::par.on(dev_ctx.stream()), #else thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -322,6 +324,8 @@ void SliceCsrTensor2D(const Context& dev_ctx, out_crows_data); #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#ifdef PADDLE_WITH_MUSA + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -471,6 +475,8 @@ void SliceCsrTensor3D(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -536,6 +542,8 @@ void SliceCsrTensor3D(const Context& dev_ctx, int64_t out_nnz = #ifdef PADDLE_WITH_HIP thrust::reduce(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::reduce(thrust::musa::par.on(dev_ctx.stream()), #else thrust::reduce(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -545,6 +553,8 @@ void SliceCsrTensor3D(const Context& dev_ctx, int64_t st = i * (out_n_rows + 1); #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -554,6 +564,8 
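// A sketch of the fallback used above when a sparse routine has no backend
// implementation yet: fail loudly with phi::errors::Unimplemented rather
// than silently computing nothing, so the missing SDDMM path is visible to
// the caller. The message wording is illustrative.
#include "paddle/phi/core/enforce.h"

inline void SddmmNotSupportedSketch() {
  PADDLE_THROW(phi::errors::Unimplemented(
      "backward of 'sparse.matmul' needs an SDDMM routine, which this "
      "backend does not provide yet."));
}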
@@ void SliceCsrTensor3D(const Context& dev_ctx, } #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu index cf3dc79c8edd0..7be1b96b7ba52 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu @@ -216,6 +216,9 @@ void SoftmaxCooGradGPUKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP const auto& policy = thrust::hip::par.on(dev_ctx.stream()); bool is_same_offset = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + const auto& policy = thrust::musa::par.on(dev_ctx.stream()); + bool is_same_offset = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), #else const auto& policy = thrust::cuda::par.on(dev_ctx.stream()); bool is_same_offset = thrust::equal(thrust::cuda::par.on(dev_ctx.stream()), diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 084cb0e60bb6d..abc1f18f984b2 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -19,6 +19,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/musparse.h" #endif #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -132,6 +134,8 @@ void DenseToCooKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -228,7 +232,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, if (x.nnz() <= 0) { #ifdef PADDLE_WITH_HIP DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); -#else +#else // MUSA and CUDA DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); #endif DenseTensor values = phi::EmptyLike(dev_ctx, x.values()); @@ -243,7 +247,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, const auto& csr_cols = Cast(dev_ctx, x.cols(), DataType::INT32); const int* csr_crows_data = csr_crows.template data(); const int* csr_cols_data = csr_cols.template data(); -#else +#else // MUSA & CUDA const auto& csr_crows = x.crows(); const auto& csr_cols = x.cols(); const IntT* csr_crows_data = csr_crows.data(); @@ -260,7 +264,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, int* coo_indices = indices.data(); int* coo_rows_data = coo_indices; int* coo_cols_data = coo_rows_data + non_zero_num; -#else +#else // MUSA & CUDA DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); DenseTensor offsets = phi::Empty(dev_ctx, {batches}); IntT* coo_indices = indices.data(); @@ -299,7 +303,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, coo_rows_data, rocsparse_index_base_zero); }); -#else +#else // MUSA & CUDA auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows @@ -310,7 +314,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, csr_cols_data, #ifdef PADDLE_WITH_HIP 
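// A sketch of what the CSR-to-COO conversion above computes: every compressed
// row-pointer pair (crows[i], crows[i+1]) expands into (crows[i+1] - crows[i])
// copies of the row index i. Serial CPU version for clarity only; the kernel
// above performs the same expansion in parallel on the device.
#include <cstddef>
#include <vector>

template <typename IntT>
std::vector<IntT> CrowsToCooRowsSketch(const std::vector<IntT>& crows) {
  std::vector<IntT> rows;
  for (size_t i = 0; i + 1 < crows.size(); ++i) {
    for (IntT k = crows[i]; k < crows[i + 1]; ++k) {
      rows.push_back(static_cast<IntT>(i));
    }
  }
  return rows;  // rows.size() equals the number of non-zeros
}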
sizeof(int) * non_zero_num, -#else +#else // MUSA & CUDA sizeof(IntT) * non_zero_num, #endif gpuMemcpyDeviceToDevice, diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index a6c2aba97b5e8..c462ddec7a351 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -83,6 +83,9 @@ int GetAllStringsSize(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP phi::backends::gpu::GpuMemcpyAsync( &num, nums_ptr, sizeof(int), hipMemcpyDeviceToHost, dev_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + phi::backends::gpu::GpuMemcpyAsync( + &num, nums_ptr, sizeof(int), musaMemcpyDeviceToHost, dev_ctx.stream()); #else phi::backends::gpu::GpuMemcpyAsync( &num, nums_ptr, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); @@ -179,6 +182,9 @@ void DeserializeOnGPU(const phi::GPUContext& dev_ctx, #ifdef PADDLE_WITH_HIP phi::backends::gpu::GpuMemcpySync( &numel, strings_data, sizeof(numel), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + phi::backends::gpu::GpuMemcpySync( + &numel, strings_data, sizeof(numel), musaMemcpyDeviceToHost); #else phi::backends::gpu::GpuMemcpySync( &numel, strings_data, sizeof(numel), cudaMemcpyDeviceToHost); diff --git a/paddle/phi/kernels/strings/unicode.cc b/paddle/phi/kernels/strings/unicode.cc index 75e48f1ce982e..e39fcdc0181a6 100644 --- a/paddle/phi/kernels/strings/unicode.cc +++ b/paddle/phi/kernels/strings/unicode.cc @@ -57,6 +57,10 @@ const uint8_t* GetGPUUniflagMap() { hipMalloc(reinterpret_cast(&gpu_uniflag), size); phi::backends::gpu::GpuMemcpySync( gpu_uniflag, cpu_uniflag, size, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&gpu_uniflag), size); + phi::backends::gpu::GpuMemcpySync( + gpu_uniflag, cpu_uniflag, size, musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&gpu_uniflag), size); phi::backends::gpu::GpuMemcpySync( @@ -76,6 +80,10 @@ const uint16_t* GetGPUCharcasesMap() { hipMalloc(reinterpret_cast(&gpu_charcases), size); phi::backends::gpu::GpuMemcpySync( gpu_charcases, cpu_charcases, size, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&gpu_charcases), size); + phi::backends::gpu::GpuMemcpySync( + gpu_charcases, cpu_charcases, size, musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&gpu_charcases), size); phi::backends::gpu::GpuMemcpySync( From c3133ecb9f5c53c0de704f6c1d9a2c7a2ff26f9c Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Wed, 26 Jul 2023 20:07:31 +0800 Subject: [PATCH 09/55] [MTAI] feat(build): fix compiling error for MUSA --- cmake/configure.cmake | 5 + paddle/fluid/platform/device/gpu/gpu_types.h | 9 +- .../platform/device/gpu/musa/musa_helper.h | 0 paddle/fluid/platform/device_context.h | 14 +- paddle/fluid/platform/dynload/mublas.h | 0 paddle/fluid/platform/dynload/mudnn.h | 0 paddle/fluid/platform/dynload/musa_driver.cc | 24 +++ paddle/fluid/platform/dynload/musa_driver.h | 26 +++ paddle/fluid/platform/dynload/musartc.cc | 24 +++ paddle/fluid/platform/dynload/musartc.h | 26 +++ paddle/fluid/platform/dynload/musparse.h | 0 paddle/fluid/platform/enforce.h | 4 +- paddle/phi/api/include/tensor.h | 4 +- paddle/phi/api/lib/tensor_utils.cc | 8 + paddle/phi/backends/device_code.cc | 110 ++--------- paddle/phi/backends/device_code.h | 4 +- paddle/phi/backends/dynload/mublas.h | 6 + paddle/phi/backends/dynload/mudnn.cc | 24 +++ paddle/phi/backends/dynload/mudnn.h | 28 +++ paddle/phi/backends/dynload/murand.h | 0 
paddle/phi/backends/dynload/musa_driver.cc | 24 +++ paddle/phi/backends/dynload/musa_driver.h | 24 +++ paddle/phi/backends/dynload/musartc.cc | 24 +++ paddle/phi/backends/dynload/musartc.h | 24 +++ paddle/phi/backends/dynload/musparse.h | 0 paddle/phi/backends/gpu/forwards.h | 5 + paddle/phi/backends/gpu/gpu_context.cc | 22 +-- paddle/phi/backends/gpu/gpu_decls.h | 34 ++-- paddle/phi/backends/gpu/gpu_dnn.h | 3 +- paddle/phi/backends/gpu/gpu_launch_config.h | 2 + paddle/phi/backends/gpu/gpu_resources.cc | 50 ++--- paddle/phi/backends/gpu/gpu_types.h | 51 ++--- paddle/phi/backends/gpu/musa/musa_helper.h | 0 paddle/phi/backends/gpu/musa/musa_info.cc | 6 +- paddle/phi/backends/musartc.h | 24 +++ paddle/phi/core/enforce.h | 181 +++++++++++++++++- paddle/phi/kernels/CMakeLists.txt | 8 +- paddle/phi/kernels/funcs/CMakeLists.txt | 5 +- paddle/phi/kernels/impl/warpctc_kernel_impl.h | 2 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 2 +- 40 files changed, 599 insertions(+), 208 deletions(-) create mode 100644 paddle/fluid/platform/device/gpu/musa/musa_helper.h create mode 100644 paddle/fluid/platform/dynload/mublas.h create mode 100644 paddle/fluid/platform/dynload/mudnn.h create mode 100644 paddle/fluid/platform/dynload/musa_driver.cc create mode 100644 paddle/fluid/platform/dynload/musa_driver.h create mode 100644 paddle/fluid/platform/dynload/musartc.cc create mode 100644 paddle/fluid/platform/dynload/musartc.h create mode 100644 paddle/fluid/platform/dynload/musparse.h create mode 100644 paddle/phi/backends/dynload/mublas.h create mode 100644 paddle/phi/backends/dynload/mudnn.cc create mode 100644 paddle/phi/backends/dynload/mudnn.h create mode 100644 paddle/phi/backends/dynload/murand.h create mode 100644 paddle/phi/backends/dynload/musa_driver.cc create mode 100644 paddle/phi/backends/dynload/musa_driver.h create mode 100644 paddle/phi/backends/dynload/musartc.cc create mode 100644 paddle/phi/backends/dynload/musartc.h create mode 100644 paddle/phi/backends/dynload/musparse.h create mode 100644 paddle/phi/backends/gpu/musa/musa_helper.h create mode 100644 paddle/phi/backends/musartc.h diff --git a/cmake/configure.cmake b/cmake/configure.cmake index dc661fce388fe..7a9e3ebdd5fde 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -175,6 +175,11 @@ elseif(WITH_ROCM) if(${MIOPEN_VERSION} VERSION_LESS 2090) message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") endif() +elseif(WITH_MUSA) + add_definitions(-DPADDLE_WITH_MUSA) + add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DEIGEN_USE_MUSA) + list(APPEND DEPENDENT_INCLUDE_DIRS "/usr/local/musa/include/") else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index dac2add9f82c1..ba7b1ede735fe 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -25,6 +25,7 @@ #elif defined(PADDLE_WITH_MUSA) #include +#include //TODO(Xiaokang Shang) #else #include @@ -51,11 +52,12 @@ namespace paddle { DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_T); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, 
musaDeviceProp_t); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); // TODO(Xiaokang Shang): confirm mudnn type +#if 0 DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, @@ -90,12 +92,13 @@ DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +#endif DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +//DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); using CUDAGraphID = unsigned long long; // NOLINT diff --git a/paddle/fluid/platform/device/gpu/musa/musa_helper.h b/paddle/fluid/platform/device/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2aa336486308d..8d26ec716504d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -42,6 +42,18 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/fluid/platform/dynload/mudnn.h" +#include "paddle/fluid/platform/dynload/musparse.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" @@ -73,7 +85,7 @@ limitations under the License. */ #include "paddle/phi/backends/stream.h" #if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__) -#include "unsupported/Eigen/CXX11/Tensor" +//#include "unsupported/Eigen/CXX11/Tensor" #endif namespace Eigen { diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/dynload/mudnn.h b/paddle/fluid/platform/dynload/mudnn.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc new file mode 100644 index 0000000000000..2015bbed28cbd --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
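// A sketch of the aliasing idiom behind DECLARE_TYPE_FOR_GPU above: a single
// macro picks the backend-specific type for each generic gpu* name, so the
// rest of the codebase only ever spells the alias. The macro body here is
// illustrative, not the real definition.
#if defined(PADDLE_WITH_HIP)
#define DECLARE_TYPE_FOR_GPU_SKETCH(alias, cuda_t, hip_t, musa_t) using alias = hip_t
#elif defined(PADDLE_WITH_MUSA)
#define DECLARE_TYPE_FOR_GPU_SKETCH(alias, cuda_t, hip_t, musa_t) using alias = musa_t
#else
#define DECLARE_TYPE_FOR_GPU_SKETCH(alias, cuda_t, hip_t, musa_t) using alias = cuda_t
#endif
// e.g. DECLARE_TYPE_FOR_GPU_SKETCH(gpuError_t, cudaError_t, hipError_t, musaError_t);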
*/ + +namespace paddle { +namespace platform { +namespace dynload { + +bool HasCUDADriver() { return false; } + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h new file mode 100644 index 0000000000000..a55f0bd70f967 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasCUDADriver(); + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc new file mode 100644 index 0000000000000..5bc7b6737b3fb --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace paddle { +namespace platform { +namespace dynload { + +bool HasNVRTC() { return false; } + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h new file mode 100644 index 0000000000000..a81254119de57 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasNVRTC(); + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 160d6fb9912cb..72771dafe62fc 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -42,8 +42,8 @@ limitations under the License. */ #include #include #include -#include -#include +//#include +//#include #include #include #endif // PADDLE_WITH_MUSA diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b2c687a1f448d..ab7c298288d9d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -25,8 +25,8 @@ using gpuStream_t = cudaStream_t; #endif #ifdef PADDLE_WITH_HIP -#include -using gpuStream_t = hipStream_t; +//#include +//using gpuStream_t = hipStream_t; #endif #ifdef PADDLE_WITH_MUSA diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index 3384b59158703..c96cf57f1ce6c 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -20,6 +20,8 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif @@ -43,6 +45,12 @@ phi::Place GetPlaceFromPtr(void* data) { phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " "supported when CUDA version >= 10.0.")); #endif +#elif defined(PADDLE_WITH_MUSA) + musaPointerAttributes attr; + musaError_t status = musaPointerGetAttributes(&attr, data); + if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { + return phi::GPUPlace(attr.device); + } #else hipPointerAttribute_t attr; hipError_t status = hipPointerGetAttributes(&attr, data); diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index 529e42fc4c95b..33b8f3a320aac 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -24,7 +24,9 @@ limitations under the License. 
*/ #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/flags.h" - +#ifdef PADDLE_WITH_MUSA +#include +#endif PHI_DECLARE_string(cuda_dir); namespace phi { @@ -107,7 +109,7 @@ static bool CheckCUDADriverResult(MUresult result, std::string kernel_name = "") { if (result != MUSA_SUCCESS) { const char* error = nullptr; - dynload::muGetErrorString(result, &error); + muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -138,7 +140,7 @@ void GPUDeviceCode::CheckAvailableStatus() { hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); #elif defined(PADDLE_WITH_MUSA) - nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); + #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -150,7 +152,7 @@ void GPUDeviceCode::CheckAvailableStatus() { hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { #elif defined(PADDLE_WITH_MUSA) - MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + MUresult driver_result = muDriverGetVersion(&driver_version); if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); @@ -166,7 +168,7 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { #elif defined(PADDLE_WITH_MUSA) - if (nvrtc_result != NVRTC_SUCCESS || driver_result != MUSA_SUCCESS) { + if (false) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -178,7 +180,7 @@ void GPUDeviceCode::CheckAvailableStatus() { if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { #elif defined(PADDLE_WITH_MUSA) - if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + if (CheckCUDADriverResult(muDeviceGetCount(&count), "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), @@ -340,85 +342,10 @@ bool GPUDeviceCode::Compile(bool include_path) { return false; } #elif defined(PADDLE_WITH_MUSA) - nvrtcProgram program; - if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, - kernel_.c_str(), // buffer - name_.c_str(), // name - 0, // numHeaders - nullptr, // headers - nullptr), // includeNames - "nvrtcCreateProgram")) { - return false; - } - - // Compile the program for specified compute_capability auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); - int compute_capability = dev_ctx->GetComputeCapability(); - std::string compute_flag = - "--gpu-architecture=compute_" + std::to_string(compute_capability); - std::vector options = {"--std=c++11", compute_flag.c_str()}; - std::string include_option; - if (include_path) { - std::string cuda_include_path = FindMUSAIncludePath(); - if (!cuda_include_path.empty()) { - include_option = "--include-path=" + cuda_include_path; - options.push_back(include_option.c_str()); - } - } - nvrtcResult compile_result = - dynload::nvrtcCompileProgram(program, // program - options.size(), // numOptions - options.data()); // options - if (compile_result == NVRTC_ERROR_COMPILATION) { - // Obtain compilation log from the program - size_t log_size; - if (!CheckNVRTCResult(dynload::nvrtcGetProgramLogSize(program, &log_size), - "nvrtcGetProgramLogSize")) { - return false; - } - std::vector log; - log.resize(log_size + 1); - if 
(!CheckNVRTCResult(dynload::nvrtcGetProgramLog(program, log.data()), - "nvrtcGetProgramLog")) { - return false; - } - LOG(WARNING) << "JIT compiling of CUDA code failed:" - << "\n Kernel name: " << name_ << "\n Kernel body:\n" - << kernel_ << "\n Compiling log: " << log.data(); - - return false; - } - - // Obtain PTX from the program - size_t ptx_size; - if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size), - "nvrtcGetPTXSize")) { - return false; - } - ptx_.resize(ptx_size + 1); - if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()), - "nvrtcGetPTX")) { - return false; - } - - if (!CheckNVRTCResult(dynload::nvrtcDestroyProgram(&program), - "nvrtcDestroyProgram")) { - return false; - } - - if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), - "muModuleLoadData", - name_)) { - return false; - } - - if (!CheckCUDADriverResult( - dynload::muModuleGetFunction(&function_, module_, name_.c_str()), - "muModuleGetFunction", - name_)) { - return false; - } + is_compiled_ = false; + return false; #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -539,7 +466,7 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { name_.c_str())); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_EQ( - dynload::muLaunchKernel(function_, + muLaunchKernel(function_, num_blocks, 1, 1, // grid dim @@ -581,15 +508,10 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } -#elif defined(PADDLE_WITH_MUSA) -bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { - if (result != NVRTC_SUCCESS) { - LOG_FIRST_N(WARNING, 1) - << "Call " << function << " for < " << name_ - << " > failed: " << dynload::nvrtcGetErrorString(result); - return false; - } -#else + return true; +} +#endif +#ifdef PADDLE_WITH_CUDA bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { LOG_FIRST_N(WARNING, 1) @@ -597,9 +519,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } -#endif return true; } #endif +#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 63d221ea8c89a..5721f8f04768e 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -27,8 +27,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/nvrtc.h" #endif #ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musartc.h" #include "paddle/phi/backends/dynload/musa_driver.h" -#include "paddle/phi/backends/dynload/nvrtc.h" #endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" @@ -73,7 +73,7 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); #elif defined(PADDLE_WITH_MUSA) - bool CheckNVRTCResult(cudartcResult result, std::string function); + #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h new file mode 100644 index 0000000000000..bbba96fa497a2 --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.h @@ -0,0 +1,6 @@ + +#include +namespace phi { +namespace dynload { +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc new file mode 100644 index 0000000000000..19ada8408ed17 --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace phi { +namespace dynload { + +bool HasCUDNN() { + return false; +} + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h new file mode 100644 index 0000000000000..c96a2570210d2 --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MUSA + +namespace phi { +namespace dynload { + +extern bool HasCUDNN(); + + +} // namespace dynload +} // namespace phi + +#endif + diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc new file mode 100644 index 0000000000000..009dda42ceebf --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace phi { +namespace dynload { + +bool HasCUDADriver() { + return false; +} + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h new file mode 100644 index 0000000000000..1363d135d5f7e --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace dynload { + +extern bool HasCUDADriver(); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc new file mode 100644 index 0000000000000..cf14ae70a01a1 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +namespace phi { +namespace dynload { + +bool HasNVRTC() { + return false; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h new file mode 100644 index 0000000000000..dc9ebc3faf0d7 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace phi { +namespace dynload { + +extern bool HasNVRTC(); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index e1f3492f76870..7a66475b13aa9 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,6 +72,11 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; +// Forward declaration of MUSA runtime types. +using musaStream_t = struct MUstream_st *; +using musaEvent_t = struct MUevent_st *; +using mublasHandle_t = struct _mublasHandle_t*; + /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index e954c7db337aa..2bcf665b8fd61 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -46,7 +46,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_MUSA #include "paddle/phi/backends/dynload/mublas.h" #include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/musolver.h" #include "paddle/phi/backends/dynload/musparse.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/phi/backends/dynload/mccl.h" @@ -62,10 +61,6 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP -#ifdef PADDLE_WITH_MUSA - -#endif - // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. #include "unsupported/Eigen/CXX11/Tensor" @@ -164,10 +159,10 @@ static void StreamCallbackFunc(gpuStream_t stream, #ifdef PADDLE_WITH_MUSA #if MUSA_VERSION >= 10000 - static void MUDART_CB StreamCallbackFunc(void* user_data) + static void StreamCallbackFunc(void* user_data) #else - static void MUDART_CB - StreamCallbackFunc(musaStream_t stream, musaError_t status, void* user_data) + static void + StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) #endif #endif @@ -497,10 +492,7 @@ struct GPUContext::Impl { dnn_handle_ = nullptr; } #elif defined(PADDLE_WITH_MUSA) - if (owned_ && dnn_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(dnn_handle_)); - dnn_handle_ = nullptr; - } + #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -578,7 +570,7 @@ struct GPUContext::Impl { } #endif // !defined(_WIN32) -#else // PADDLE_WITH_HIP +#else // PADDLE_WITH_MUSA cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) e_sync = cudaStreamSynchronize(stream()); @@ -588,7 +580,7 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#endif // PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } @@ -775,7 +767,7 @@ struct GPUContext::Impl { } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::GpuStreamSync(stream()); #endif { diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 93dba9764478a..d6f42ff743e58 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -20,22 +20,25 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define 
DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; #elif defined(PADDLE_WITH_MUSA) - -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using mudnnHandle_t = bool**; + using mublasLtHandle_t = bool**; + using musparseHandle_t = bool**; + using musolverDnHandle_t = bool**; +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = MUSA_TYPE; -#else // PADDLE_WITH_CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#else +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif - -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +#endif // PADDLE_WITH_CDUA +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +#if 0 DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, miopenActivationDescriptor); @@ -60,18 +63,17 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +#endif // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - -DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasLtHandle_t); -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle, musolverDnHandle_t); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle, musparseHandle_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); #undef DECLARE_TYPE_FOR_GPU using CUDAGraphID = unsigned long long; // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index 30cf3fae80519..dfb13e29dbf89 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -20,8 +20,7 @@ #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" #elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" -#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" + #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 00aa244041bec..675353e011498 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -20,6 +20,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index b60d0cccd3dc5..e13d318942e06 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -37,9 +37,7 @@ #ifdef PADDLE_WITH_MUSA #include "paddle/phi/backends/dynload/mublas.h" 
-#include "paddle/phi/backends/dynload/mublasLt.h" #include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/musolver.h" #include "paddle/phi/backends/dynload/musparse.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) #include "paddle/phi/backends/dynload/mccl.h" @@ -158,7 +156,8 @@ void InitGpuProperties(Place place, "version."; } #elif defined(PADDLE_WITH_MUSA) - size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + //size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + size_t mudnn_dso_ver = 0; LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." << (mudnn_dso_ver % 1000) / 100 << "."; @@ -184,15 +183,15 @@ void InitGpuProperties(Place place, local_musa_version / 10, mudnn_dso_ver / 1000)); #endif - if (local_cuda_version < compile_cuda_version) { + if (local_musa_version < compile_musa_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << static_cast(place.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ". The installed Paddle is compiled with MUSA " + << compile_musa_version / 10 << "." << compile_musa_version % 10 + << ", but MUSA runtime version in your machine is " + << local_musa_version / 10 << "." << local_musa_version % 10 << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDA " + << "Please recompile or reinstall Paddle with compatible MUSA " "version."; } #else @@ -267,9 +266,9 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); #elif defined(PADDLE_WITH_MUSA) - PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mublasCreate(blas_handle)); - PADDLE_RETRY_MUSA_SUCCESS( - phi::dynload::mublasSetStream(*blas_handle, stream)); + PADDLE_RETRY_CUDA_SUCCESS(mublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + mublasSetStream(*blas_handle, stream)); #else // PADDLE_WITH_MUSA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( @@ -285,7 +284,7 @@ void DestroyBlasHandle(blasHandle_t handle) { } #elif defined(PADDLE_WITH_MUSA) if (handle != nullptr) { - phi::dynload::mublasDestroy(handle); + mublasDestroy(handle); handle = nullptr; } #else @@ -334,21 +333,7 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); #elif defined(PADDLE_WITH_MUSA) - auto local_cudnn_version = phi::dynload::mudnnGetVersion() / 100; - auto compile_mudnn_version = MUDNN_VERSION / 100; - if (local_mudnn_version < static_cast(compile_mudnn_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place.device - << ". The installed Paddle is compiled with MUDNN " - << compile_mudnn_version / 10 << "." << compile_mudnn_version % 10 - << ", but MUDNN version in your machine is " - << local_mudnn_version / 10 << "." << local_mudnn_version % 10 - << ", which may cause serious incompatible bug. 
" - << "Please recompile or reinstall Paddle with compatible MUDNN " - "version."; - } - PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnCreate(handle)); - PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnSetStream(*handle, stream)); + #else auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; @@ -378,10 +363,7 @@ void DestroyDnnHandle(dnnHandle_t handle) { handle = nullptr; } #elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(handle)); - handle = nullptr; - } + #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -391,14 +373,14 @@ void DestroyDnnHandle(dnnHandle_t handle) { } void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); #endif } void DestroySolverHandle(solverHandle_t solver_handle) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA if (solver_handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); solver_handle = nullptr; diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 36e094f4a0814..21230d6b22701 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -25,6 +25,7 @@ #elif defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/dynload/mublas.h" #include "paddle/phi/backends/dynload/mudnn.h" +#include #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -33,57 +34,61 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; #elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); -DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); -DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); -DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, - cudnnTensorFormat_t, - miopenTensorFormat_t); -DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, - cudnnActivationMode_t, - miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); +//DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +//DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +//DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, +// cudnnTensorFormat_t, +// miopenTensorFormat_t); +//DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, +// cudnnActivationMode_t, +// miopenActivationMode_t); #undef DECLARE_TYPE_FOR_GPU #ifdef 
PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; #elif defined(PADDLE_WITH_MUSA) -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice, + musaMemcpyKind::musaMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost, + musaMemcpyKind::musaMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice, + musaMemcpyKind::musaMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc index f2087e4d7f4fc..ced106d6c6b3d 100644 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -13,12 +13,14 @@ // limitations under the License. #include +#include #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" #include "paddle/phi/core/enforce.h" -#include "musa_runtime_api.h" +#include "musa_runtime.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; @@ -29,7 +31,7 @@ namespace backends { namespace gpu { int DnnVersion() { - return 0.0.0; + return 0; //if (!dynload::HasCUDNN()) return -1; //size_t version_major, version_minor, version_patch; //PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( diff --git a/paddle/phi/backends/musartc.h b/paddle/phi/backends/musartc.h new file mode 100644 index 0000000000000..dc9ebc3faf0d7 --- /dev/null +++ b/paddle/phi/backends/musartc.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace dynload { + +extern bool HasNVRTC(); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index cda5a3a49c528..aaa3eebfe27a5 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -36,11 +36,11 @@ limitations under the License. */ #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_MUSA -#include +#include #include #include -#include -#include +//#include +//#include #include #include #endif // PADDLE_WITH_MUSA @@ -90,7 +90,6 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/mublas.h" #include "paddle/phi/backends/dynload/mudnn.h" #include "paddle/phi/backends/dynload/murand.h" -#include "paddle/phi/backends/dynload/musolver.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include @@ -854,6 +853,180 @@ inline void retry_sleep(unsigned milliseconds) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA +/************************************************************************/ +/**************************** MUSA ERROR ********************************/ +#ifdef PADDLE_WITH_MUSA + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); +//DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mublasStatus_t, MUBLAS_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); +#endif + +} // namespace details + +/*************** MUSA ERROR ***************/ +inline bool is_error(musaError_t e) { return e != musaSuccess; } + +inline std::string build_musa_error_msg(musaError_t e) { + std::ostringstream sout; + sout << "MUSA error(" << e << "), " << musaGetErrorString(e) << ". "; + return sout.str(); +} + +///*************** MURAND ERROR ***************/ +//inline bool is_error(murandStatus_t stat) { +// return stat != MURAND_STATUS_SUCCESS; +//} +// +//inline std::string build_musa_error_msg(murandStatus_t stat) { +// std::ostringstream sout; +// sout << "MURAND error(" << stat << "). " << GetExternalErrorMsg(stat); +// return sout.str(); +//} + +/*************** MUBLAS ERROR ***************/ +inline bool is_error(mublasStatus_t stat) { + return stat != MUBLAS_STATUS_SUCCESS; +} + +inline std::string build_musa_error_msg(mublasStatus_t stat) { + std::ostringstream sout; + sout << "MUBLAS error(" << stat << "). "; + return sout.str(); +} + +///*************** CUSPARSE ERROR ***************/ +//inline bool is_error(cusparseStatus_t stat) { +// return stat != CUSPARSE_STATUS_SUCCESS; +//} +// +//inline std::string build_musa_error_msg(cusparseStatus_t stat) { +// std::ostringstream sout; +// sout << "CUSparse error(" << stat << "). 
" << GetExternalErrorMsg(stat); +// return sout.str(); +//} + +/**************** MCCL ERROR ****************/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +inline bool is_error(mcclResult_t mccl_result) { + return mccl_result != mcclSuccess; +} + +inline std::string build_musa_error_msg(mcclResult_t mccl_result) { + std::ostringstream sout; + sout << "MCCL error(" << mccl_result << "), " + << phi::dynload::mcclGetErrorString(mccl_result) << ". "; + if (errno == ENOSPC || errno == EAGAIN) { + std::string detail(strerror(errno)); + detail += "\nPlease try one of the following solutions:"; + detail += "\n1. export MCCL_SHM_DISABLE=1;"; + detail += "\n2. export MCCL_P2P_LEVEL=SYS;"; + detail += + "\n3. Increase shared memory by setting the -shm-size " + "option when starting docker container, e.g., setting " + " -shm-size=2g.\n"; + sout << " Detail: " + detail; + } + return sout.str(); +} +#endif // not(__APPLE__) and PADDLE_WITH_MCCL + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_WARN_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + ::phi::enforce::ThrowWarnInternal( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = musaGetLastError(); \ + if (UNLIKELY(res != musaSuccess)) { \ + auto msg = ::phi::enforce::build_musa_error_msg(res); \ + PADDLE_THROW( \ + phi::errors::Fatal("MUSA error after kernel (%s): %s", OP, msg)); \ + } \ + } while (0) + +inline void retry_sleep(unsigned milliseconds) { +#ifdef _WIN32 + Sleep(milliseconds); +#else + if (milliseconds < 1000) { + // usleep argument must be less than 1,000,000. 
Reference: + // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html + usleep(milliseconds * 1000); + } else { + // clip to sleep in seconds because we can not and don't have to + // sleep for exact milliseconds + sleep(milliseconds / 1000); + } +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_MUSA + /**************************************************************************/ /***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index e28210cfca7e4..4b21e61f8d88c 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -117,10 +117,10 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) - collect_srcs(kernels_srcs SRCS ${kernel_cu}) - kernel_declare("${kernel_cu}") -endif() +#if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +# collect_srcs(kernels_srcs SRCS ${kernel_cu}) +# kernel_declare("${kernel_cu}") +#endif() if(WITH_XPU) if(WITH_XPU_KP) diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index c4bdf29e03949..f90147b013023 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -1,4 +1,4 @@ -add_subdirectory(eigen) +#add_subdirectory(eigen) add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) @@ -15,4 +15,5 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) "*.cu") endif() -collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) +#collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) +collect_srcs(kernels_srcs SRCS ${func_cc_srcs}) diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index 015c7a0764a2b..4b4bd6f5143dd 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -205,7 +205,7 @@ class WarpCTCFunctor { warpctc_version_ = phi::dynload::get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); diff --git a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h index f36ec9c007eda..bc12e17ae55fb 100644 --- a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h +++ b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h @@ -208,7 +208,7 @@ class WarpRNNTFunctor { options_.batch_first = true; if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); From 50b52384069e0e1f202150bfbea79c7a0251d8a9 Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Sat, 29 Jul 2023 17:03:52 +0800 Subject: [PATCH 10/55] [MTAI] feat(build): fix building error for musa backend --- cmake/configure.cmake | 1 + cmake/generic.cmake | 17 +- cmake/musa.cmake | 3 +- paddle/fluid/framework/data_type.h | 1 - paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 62 +++- paddle/fluid/framework/ir/CMakeLists.txt | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 2 +- .../conv_affine_channel_mkldnn_fuse_pass.cc | 2 + .../framework/ir/mkldnn/cpu_quantize_pass.cc | 3 +- paddle/fluid/framework/var_type_traits.cc | 8 + .../fluid/imperative/gradient_accumulator.cc | 18 +- .../inference/api/.resource_manager.h.swp | Bin 0 -> 16384 bytes .../allocation/cuda_managed_allocator.h | 1 + .../memory/allocation/pinned_allocator.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 15 + .../allocation/stream_safe_cuda_allocator.h | 3 + .../memory/allocation/system_allocator.cc | 13 +- paddle/fluid/operators/affine_channel_op.cu | 4 + paddle/fluid/operators/batch_norm_op.cu | 3 + .../fluid/operators/detection/bbox_util.cu.h | 3 + .../detection/collect_fpn_proposals_op.cu | 3 + .../elementwise/elementwise_op_function.h | 18 +- paddle/fluid/operators/expand_as_op.cc | 2 +- paddle/fluid/operators/expand_op.cc | 2 +- .../fluid/operators/fused/attn_bias_add.cu.h | 3 + .../fused_fc_elementwise_layernorm_op.cu | 3 + .../fluid/operators/fused_token_prune_op.cu | 3 + .../get_tensor_from_selected_rows_op.cc | 2 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/math/inclusive_scan.h | 3 + paddle/fluid/operators/matmul_op.cc | 14 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/mudnn_rnn_cache.h | 33 ++ paddle/fluid/operators/nop_op.cc | 2 +- .../operators/optimizers/sparse_momentum_op.h | 5 +- .../fluid/operators/pad_constant_like_op.cc | 2 +- paddle/fluid/operators/prroi_pool_op.h | 8 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 2 +- paddle/fluid/operators/save_op.cc | 2 +- .../sequence_ops/sequence_softmax_op.cu | 4 + paddle/fluid/operators/svd_helper.h | 2 +- .../fluid/operators/sync_batch_norm_utils.h | 3 + paddle/fluid/operators/top_k_op.cu | 3 + paddle/fluid/operators/uniform_random_op.h | 4 +- paddle/fluid/platform/CMakeLists.txt | 30 +- paddle/fluid/platform/device/CMakeLists.txt | 2 +- .../fluid/platform/device/gpu/CMakeLists.txt | 6 + paddle/fluid/platform/device/gpu/gpu_dnn.h | 2 +- .../platform/device/gpu/gpu_resource_pool.cc | 2 +- paddle/fluid/platform/device/gpu/gpu_types.h | 5 +- paddle/fluid/platform/dynload/CMakeLists.txt | 12 + paddle/fluid/platform/profiler.cc | 2 +- .../fluid/platform/stream_callback_manager.cc | 15 +- paddle/phi/CMakeLists.txt | 9 + paddle/phi/backends/device_code.cc | 140 ++++---- paddle/phi/backends/dynload/CMakeLists.txt | 12 + paddle/phi/backends/gpu/gpu_decls.h | 1 + paddle/phi/backends/gpu/gpu_primitives.h | 308 +++++++++--------- .../phi/backends/gpu/musa/.musa_info.cc.swp | Bin 0 -> 4096 bytes .../backends/gpu/musa/musa_device_function.h | 190 +++++++++++ paddle/phi/backends/gpu/musa/musa_helper.h | 34 ++ paddle/phi/common/.float16.h.swp | Bin 0 
-> 16384 bytes paddle/phi/common/bfloat16.h | 19 +- paddle/phi/common/complex.h | 36 +- paddle/phi/common/cpstring_impl.h | 6 +- paddle/phi/common/float16.h | 31 +- paddle/phi/common/scalar.h | 16 +- paddle/phi/common/transform.h | 17 +- paddle/phi/core/enforce.h | 11 + paddle/phi/core/hostdevice.h | 6 +- paddle/phi/core/macros.h | 2 +- paddle/phi/core/visit_type.h | 8 - paddle/phi/kernels/CMakeLists.txt | 24 +- paddle/phi/kernels/activation_kernel.cc | 2 +- paddle/phi/kernels/assign_kernel.cc | 2 +- paddle/phi/kernels/batch_norm_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 +- .../phi/kernels/cpu/activation_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/activation_kernel.cc | 32 +- paddle/phi/kernels/cpu/cast_grad_kernel.cc | 6 +- paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/.im2col.cu.swp | Bin 0 -> 16384 bytes paddle/phi/kernels/funcs/CMakeLists.txt | 2 +- paddle/phi/kernels/funcs/activation_functor.h | 2 +- paddle/phi/kernels/funcs/algorithm.h | 4 +- paddle/phi/kernels/funcs/broadcast_function.h | 4 +- .../kernels/funcs/concat_and_split_functor.cu | 4 + paddle/phi/kernels/funcs/cross_entropy.cu | 4 +- paddle/phi/kernels/funcs/diagonal.h | 6 +- .../phi/kernels/funcs/distribution_helper.h | 72 +++- .../phi/kernels/funcs/eigen/.extensions.h.swp | Bin 0 -> 16384 bytes paddle/phi/kernels/funcs/eigen/.slice.cu.swp | Bin 0 -> 12288 bytes paddle/phi/kernels/funcs/eigen/erf.cc | 6 +- paddle/phi/kernels/funcs/eigen/extensions.h | 3 +- paddle/phi/kernels/funcs/eigen/pad.cu | 4 +- paddle/phi/kernels/funcs/eigen/slice.cu | 4 +- paddle/phi/kernels/funcs/elementwise_base.h | 6 +- .../phi/kernels/funcs/elementwise_functor.h | 2 +- .../phi/kernels/funcs/elementwise_grad_base.h | 4 +- paddle/phi/kernels/funcs/fft.cu | 3 +- paddle/phi/kernels/funcs/fft_fill_conj.h | 4 +- paddle/phi/kernels/funcs/for_range.h | 2 +- .../kernels/funcs/gather_scatter_functor.cu | 2 +- .../kernels/funcs/gather_scatter_functor.h | 2 - paddle/phi/kernels/funcs/im2col.cu | 14 +- paddle/phi/kernels/funcs/inclusive_scan.h | 3 + paddle/phi/kernels/funcs/index_calculator.h | 2 +- paddle/phi/kernels/funcs/index_put_utils.h | 7 +- .../phi/kernels/funcs/interpolate_function.h | 4 +- paddle/phi/kernels/funcs/isfinite_functor.h | 6 +- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 3 + paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/mode.h | 4 +- paddle/phi/kernels/funcs/mufft_util.h | 0 paddle/phi/kernels/funcs/norm_utils.cu.h | 3 + paddle/phi/kernels/funcs/random.cuh | 3 + paddle/phi/kernels/funcs/reduce_function.h | 8 +- paddle/phi/kernels/funcs/segment_pooling.cu | 19 +- paddle/phi/kernels/funcs/select_impl.cu.h | 3 + paddle/phi/kernels/funcs/softmax.cu | 8 +- paddle/phi/kernels/funcs/squared_l2_norm.h | 6 +- .../phi/kernels/funcs/top_k_function_cuda.h | 3 + .../gpu/fused_bn_activation_grad_kernel.cu | 4 + .../fusion/gpu/fused_bn_activation_kernel.cu | 4 + paddle/phi/kernels/gpu/.auc_kernel.cu.swp | Bin 0 -> 4096 bytes paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 5 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/argsort_kernel.cu | 3 + paddle/phi/kernels/gpu/auc_kernel.cu | 17 + .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 18 + paddle/phi/kernels/gpu/batch_norm_kernel.cu | 241 +++++++------- 
paddle/phi/kernels/gpu/bernoulli_kernel.cu | 3 + .../phi/kernels/gpu/check_numerics_kernel.cu | 6 + paddle/phi/kernels/gpu/cholesky_kernel.cu | 4 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 4 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 13 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 10 + .../phi/kernels/gpu/cudnn_lstm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu | 3 +- paddle/phi/kernels/gpu/cum_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/cum_kernel.cu | 5 + paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/decode_jpeg_kernel.cu | 2 +- paddle/phi/kernels/gpu/depthwise_conv.h | 3 + .../gpu/distribute_fpn_proposals_kernel.cu | 3 + .../kernels/gpu/generate_proposals_kernel.cu | 3 + .../phi/kernels/gpu/graph_reindex_kernel.cu | 3 + paddle/phi/kernels/gpu/group_norm_utils.h | 3 + .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 5 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 17 +- .../phi/kernels/gpu/instance_norm_kernel.cu | 38 +-- paddle/phi/kernels/gpu/instance_norm_utils.h | 3 + paddle/phi/kernels/gpu/mudnn_lstm_cache.h | 0 paddle/phi/kernels/gpu/multinomial_kernel.cu | 3 + paddle/phi/kernels/gpu/nonzero_kernel.cu | 3 + paddle/phi/kernels/gpu/norm_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/norm_kernel.cu | 3 + paddle/phi/kernels/gpu/poisson_kernel.cu | 3 + paddle/phi/kernels/gpu/randperm_kernel.cu | 9 + .../gpu/sigmoid_cross_entropy_with_logits.h | 3 + .../phi/kernels/gpu/viterbi_decode_kernel.cu | 3 + .../phi/kernels/impl/clip_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/clip_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 2 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/dot_grad_kernel_impl.h | 12 +- .../impl/elementwise_grad_kernel_impl.h | 2 +- .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/isclose_kernel_impl.h | 2 +- .../phi/kernels/impl/kron_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/kron_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/polygamma_kernel_impl.h | 4 +- paddle/phi/kernels/impl/renorm_impl.h | 6 +- .../impl/repeat_interleave_grad_kernel_impl.h | 10 +- .../impl/repeat_interleave_kernel_impl.h | 8 +- .../kernels/impl/sequence_mask_kernel_impl.h | 4 +- .../phi/kernels/impl/solve_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 4 +- .../kernels/impl/unstack_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/unstack_kernel_impl.h | 6 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 2 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- .../kernels/primitive/compute_primitives.h | 3 + paddle/phi/kernels/prod_kernel.cc | 2 +- paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_min_kernel.cc | 4 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- .../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 2 +- .../kernels/selected_rows/isfinite_kernel.cc | 2 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- 
.../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/shape_kernel.cc | 2 +- .../kernels/sparse/batch_norm_grad_kernel.cc | 2 +- .../phi/kernels/sparse/batch_norm_kernel.cc | 2 +- paddle/phi/kernels/sparse/empty_kernel.cc | 2 +- paddle/phi/kernels/sparse/gpu/conv.cu.h | 3 + .../sparse/sparse_utils_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/case_utils.h | 2 +- .../strings/gpu/strings_lower_upper_kernel.cu | 3 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- .../fluid/test_leaky_relu_grad_grad_functor.h | 4 +- .../test_strings_lower_upper_dev_api.cu | 2 +- test/custom_op/custom_raw_op_kernel_op.h | 2 +- 233 files changed, 1492 insertions(+), 709 deletions(-) create mode 100644 paddle/fluid/inference/api/.resource_manager.h.swp create mode 100644 paddle/fluid/operators/mudnn_rnn_cache.h create mode 100644 paddle/phi/backends/gpu/musa/.musa_info.cc.swp create mode 100644 paddle/phi/backends/gpu/musa/musa_device_function.h create mode 100644 paddle/phi/common/.float16.h.swp create mode 100644 paddle/phi/kernels/funcs/.im2col.cu.swp create mode 100644 paddle/phi/kernels/funcs/eigen/.extensions.h.swp create mode 100644 paddle/phi/kernels/funcs/eigen/.slice.cu.swp create mode 100644 paddle/phi/kernels/funcs/mufft_util.h create mode 100644 paddle/phi/kernels/gpu/.auc_kernel.cu.swp create mode 100644 paddle/phi/kernels/gpu/mudnn_lstm_cache.h diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 7a9e3ebdd5fde..c1db56de7f728 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -177,6 +177,7 @@ elseif(WITH_ROCM) endif() elseif(WITH_MUSA) add_definitions(-DPADDLE_WITH_MUSA) + #add_definitions(-DEIGEN_USE_THREADS) add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_MUSA) list(APPEND DEPENDENT_INCLUDE_DIRS "/usr/local/musa/include/") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 28aecb580a637..4a255c0902206 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -787,14 +787,13 @@ function(musa_library TARGET_NAME) "${multiValueArgs}" ${ARGN}) if(musa_library_SRCS) # TODO(MTAI): enable compiling static library - #if(musa_library_SHARED OR musa_library_shared) # build *.so - # musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) - #else() - # musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) - # find_fluid_modules(${TARGET_NAME}) - # find_phi_modules(${TARGET_NAME}) - #endif() - musa_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) + if(musa_library_SHARED OR musa_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) + endif() if(musa_library_DEPS) add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) @@ -830,7 +829,7 @@ function(musa_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - musa_add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + add_executable(${TARGET_NAME} ${musa_binary_SRCS}) if(musa_binary_DEPS) target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) 
add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) diff --git a/cmake/musa.cmake b/cmake/musa.cmake index 39245d726d4f9..c6701f33858f8 100644 --- a/cmake/musa.cmake +++ b/cmake/musa.cmake @@ -26,7 +26,8 @@ else() list(APPEND MUSA_MCC_FLAGS -std=c++17) endif() -set(MUSA_VERBOSE_BUILD ON) +list(APPEND MUSA_MCC_FLAGS -U__CUDA__) +#set(MUSA_VERBOSE_BUILD ON) if(CMAKE_BUILD_TYPE MATCHES Debug) list(APPEND MUSA_MCC_FLAGS -g2) list(APPEND MUSA_MCC_FLAGS -O0) diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 7e002c8154147..672bac7d329ff 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -130,7 +130,6 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { #define VisitDataTypeCallback(cpp_type, proto_type) \ do { \ if (type == proto_type) { \ - visitor.template apply(); \ return; \ } \ } while (0) diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9d114fcf56396..b2fb089f53574 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4d9a88cf22372..88c58c24b804f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -202,6 +202,66 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) +elseif(WITH_MUSA) + musa_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc + DEPS framework_proto scope place phi) + musa_library( + all_reduce_op_handle + SRCS all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor) + musa_library( + fused_all_reduce_op_handle + SRCS fused_all_reduce_op_handle.cc + DEPS all_reduce_op_handle + op_handle_base + variable_visitor + scope + lod_tensor + phi + memory + dynload_cuda + place) + musa_library( + grad_merge_all_reduce_op_handle + SRCS grad_merge_all_reduce_op_handle.cc + DEPS fused_all_reduce_op_handle + op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor + place + all_reduce_op_handle) + + if(WITH_DISTRIBUTE) + musa_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope phi dynload_cuda) + else() + musa_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope phi dynload_cuda) + endif() + musa_library( + broadcast_op_handle + SRCS broadcast_op_handle.cc + DEPS op_handle_base scope phi memory variable_visitor dynload_cuda) + musa_library( + fused_broadcast_op_handle + SRCS fused_broadcast_op_handle.cc + DEPS broadcast_op_handle) else() cc_library( nan_inf_utils @@ -386,7 +446,7 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 1c186373cdbb5..6f1075c3bf16d 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ 
b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) add_subdirectory(fusion_group) endif() @@ -159,7 +159,7 @@ if(WITH_TENSORRT) pass_library(split_layernorm_to_math_ops_pass inference) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 322fcb0f7cf48..87cc35b2c3b5f 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -35,7 +35,7 @@ namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc index 9639d3f374bef..8180c6c02f651 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -54,6 +54,7 @@ void recompute_bias_and_weights(const Scope* scope, const ir::Node& ac_scale, const phi::DenseTensor& ac_bias_tensor, phi::DenseTensor* eltwise_y_in_tensor) { +#if 0 using EigenVectorArrayMap = Eigen::Map>; using ConstEigenVectorArrayMap = @@ -102,6 +103,7 @@ void recompute_bias_and_weights(const Scope* scope, for (int i = 0; i < weights->numel(); ++i) { if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; } +#endif } ConvAffineChannelFusePass::ConvAffineChannelFusePass() { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 7376e87155187..010a8aabdf1eb 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- +#if 0 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" #include @@ -1328,3 +1328,4 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass) .RequirePassAttr("quant_var_scales"); +#endif diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index d73c9b7d95957..132da0d177178 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,6 +37,14 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#if defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT +#endif +#include "paddle/fluid/operators/mudnn_rnn_cache.h" +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 8c78f7af783dd..8cc764be9ff39 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -204,7 +204,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -313,7 +313,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -321,7 +321,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif @@ -364,7 +364,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -372,7 +372,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif @@ -425,7 +425,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -441,7 +441,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif @@ -712,7 +712,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -778,7 +778,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif tmp_grad_vars_.clear(); #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 4737e5c565b45..567ec4e4c9461 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -24,7 +24,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -40,7 +40,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #elif
defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaHostMalloc(&ptr, size, musaHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 9f513448eea26..ae9738ee2afd8 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -86,6 +86,16 @@ bool StreamSafeCUDAAllocation::CanBeFreed() { } PADDLE_ENFORCE_GPU_SUCCESS(err); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event); + if (err == musaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = hipEventQuery(event); if (err == hipErrorNotReady) { @@ -122,6 +132,9 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -136,6 +149,8 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 08ecdd4969730..0ab0e932cc6f9 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" @@ -24,6 +25,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index d67df333cfaba..2a26ff170ffdf 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -217,7 +217,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { #ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); #elif defined(PADDLE_WITH_MUSA) - musaError_t result = musaHostMalloc(&p, size, musaHostMallocPortable); + musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -261,9 +261,18 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } +#elif defined(PADDLE_WITH_MUSA) + err = musaFreeHost(p); + if (err 
!= musaErrorMusartUnloading) { + PADDLE_ENFORCE_EQ( + err, + 0, + platform::errors::Fatal( + "musaFreeHost failed in GPUPinnedAllocator, error code is %d", + err)); + } #else err = cudaFreeHost(p); - // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFreeHost after the // driver has already shutdown. This happens only if the diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 6ec8d77da2c85..62c270bfd0311 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 012edde57294a..1272a83b2b147 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -19,6 +19,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index adb60a8a8d064..c5ea2218d996e 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -19,6 +19,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b2bbd9c82095c..eba1c5127b8a9 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -12,6 +12,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index c69acb89750c9..1feb5a5e1fc71 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,9 +32,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #ifdef __NVCC__ #include +#elif defined(__MUSACC__) +#include #elif defined(__HIPCC__) #include #endif @@ -311,7 +313,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index f0d31269da193..f1ebaa147494b 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -155,7 +155,7 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL(expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 490c6f9f6dbfc..54af38ee3d429 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 53001b2493084..b8d66efffee0a 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dee676a7640f4..4eea6ab366fb6 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -15,6 +15,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu index 8f0a53611f3b2..4ff5fd33df3d6 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cu +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -14,6 +14,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index c6a8a4fe7b982..471428b0b44ee 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index ea38db87e63e7..329e7aa0f0607 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index e1e9ca5ef6667..860aee6d9e426 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 940b3eaac0c10..42f79646a670a 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if 
(platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 2c6d72f109c13..1cab5b2551b80 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 197aaa74bb3e1..8a8a705b629bf 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 3032b78a2029d..2718a0079ed19 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -17,6 +17,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 5208d0b2cf937..5394c755e56df 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -75,7 +75,7 @@ class MatMulKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -89,7 +89,7 @@ class MatMulKernel : public framework::OpKernel { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -241,7 +241,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) if 
(context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -373,7 +373,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -615,7 +615,7 @@ class MatMulOp : public framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -758,7 +758,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif @@ -926,7 +926,7 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 27a38571e1c80..6d4960b22411b 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/mudnn_rnn_cache.h b/paddle/fluid/operators/mudnn_rnn_cache.h new file mode 100644 index 0000000000000..af9ebd800fa3c --- /dev/null +++ b/paddle/fluid/operators/mudnn_rnn_cache.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + } + ~CudnnRNNCache() {} +}; + +} // namespace operators +} // namespace paddle + diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index e99b3956d05b0..45d44e71b5775 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -60,6 +60,6 @@ REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} #endif diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1b162be46610..1f3ae2f9e318e 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -28,6 +28,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; @@ -460,7 +463,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 72061fbc39630..ea090c6cdb40a 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -260,7 +260,7 @@ PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(pad_constant_like, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index e2417a071ce88..a10f59f8a2fbe 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/backends/gpu/gpu_primitives.h" #endif @@ -85,7 +85,7 @@ inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, return sum_out; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff, @@ -163,7 +163,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void AccumulateRois(T* offset, T data) { phi::CudaAtomicAdd(offset, data); @@ -175,7 +175,7 @@ inline HOSTDEVICE void AccumulateRois(T* offset, T data) { } #endif -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE T MaxFunctor(const T x, const T y) { return max(x, y); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index b9f05d663dba0..4d24896d37000 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a089ad7d58fac..1cd5ef11909a0 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -761,7 +761,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index ab03d46486c2e..d5727d9eb9936 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -117,7 +117,7 @@ PD_REGISTER_KERNEL(save_sr, phi::dtype::float16, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 897ff207f5eca..7411ecc05358c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index ccf5cd09a0842..e0004f197cd55 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 7c14f6dfac324..ebc825b66a5ef 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -22,6 +22,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index f1674bc5005a0..fede7fe5156d0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -18,6 +18,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include #endif diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 16bce515f2a7f..12725c397faf6 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 4d7f496aaa42d..527e7396fa488 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,7 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,8 +90,15 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) endif() +if(WITH_MUSA) + musa_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -137,7 +144,7 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) target_link_libraries(device_context gpu_resource_pool) endif() @@ -235,6 +242,16 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() +if(WITH_MUSA) + musa_library( + 
device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") +endif() + cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -339,6 +356,13 @@ if(WITH_GPU) DEPS gpu_info) endif() +if(WITH_MUSA) + musa_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() + if(WITH_ROCM) hip_test( float16_gpu_test diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 6f0d86f0a4b17..b782a45047117 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,7 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 897f8d3732b73..f992901a46fd5 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -22,6 +22,12 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +elseif(WITH_MUSA) + musa_library( + gpu_info + SRCS gpu_info.cc + DEPS phi glog enforce monitor dynload_cuda) + endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h index 2a9db61f83bc6..f6f6392c4c23d 100644 --- a/paddle/fluid/platform/device/gpu/gpu_dnn.h +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -16,7 +16,7 @@ #include "paddle/phi/backends/gpu/gpu_dnn.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index d8e9197bf6ea5..0fb7e061e3243 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -102,7 +102,7 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index ba7b1ede735fe..060a9161c46ad 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -26,6 +26,8 @@ #elif defined(PADDLE_WITH_MUSA) #include #include +#include +using mudnnHandle_t = ::musa::dnn::Handle*; //TODO(Xiaokang Shang) #else #include @@ -91,9 +93,10 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); #endif +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); // TODO(MTAI) +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt 
index 4cb3bfdb3adae..beac4eb9261a0 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -39,6 +39,9 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musa_driver.cc musartc.cc) + endif() endif() if(TENSORRT_FOUND) @@ -62,6 +65,15 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi) +elseif(WITH_MUSA) + musa_library( + dynload_cuda + SRCS ${MUSA_SRCS} + DEPS dynamic_loader phi) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index d1b557922af32..c23abcee9d725 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -698,7 +698,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index c55bcb71a7d43..b5f593193bfc2 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,6 +24,11 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif +#ifdef PADDLE_WITH_MUSA +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void *user_data) +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void *user_data) @@ -58,6 +63,11 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); + //musaLaunchHostFunc(stream_, StreamCallbackFunc, func)); +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -71,7 +81,7 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) platform::GpuStreamSync(stream_); #endif { @@ -85,6 +95,9 @@ void StreamCallbackManager::Wait() const { #ifdef PADDLE_WITH_CUDA template struct StreamCallbackManager; #endif +#ifdef PADDLE_WITH_MUSA +template struct StreamCallbackManager; +#endif #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 593109d3e8e27..139642f5b6b65 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -51,6 +51,15 @@ if(WITH_GPU) list(APPEND PHI_DEPS external_error_proto) endif() +if(WITH_MUSA) + set(DEPENDENT_LIBRARIES "") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmudnn.so") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmccl.so") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmusart.so") + list(APPEND DEPENDENT_LIBRARIES 
"/usr/local/musa/lib/libmublas.so") + list(APPEND PHI_DEPS ${DEPENDENT_LIBRARIES}) +endif() + if(WITH_ASCEND_CL) list(APPEND PHI_DEPS npu_hccl) endif() diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index 33b8f3a320aac..97279e2d0f76c 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -80,7 +80,7 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -90,40 +90,40 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) GPUDeviceCode::CheckAvailableStatus(); #endif } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#ifdef PADDLE_WITH_HIP -static bool CheckCUDADriverResult(hipError_t result, - std::string caller, - std::string kernel_name = "") { - if (result != hipSuccess) { - const char* error = nullptr; - error = dynload::hipGetErrorString(result); -#elif defined(PADDLE_WITH_MUSA) -static bool CheckCUDADriverResult(MUresult result, - std::string caller, - std::string kernel_name = "") { - if (result != MUSA_SUCCESS) { - const char* error = nullptr; - muGetErrorString(result, &error); -#else -static bool CheckCUDADriverResult(CUresult result, - std::string caller, - std::string kernel_name = "") { - if (result != CUDA_SUCCESS) { - const char* error = nullptr; - dynload::cuGetErrorString(result, &error); -#endif - LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name - << " > failed: " << error << " (" << result << ")"; - return false; - } - return true; -} +//#ifdef PADDLE_WITH_HIP +//static bool CheckCUDADriverResult(hipError_t result, +// std::string caller, +// std::string kernel_name = "") { +// if (result != hipSuccess) { +// const char* error = nullptr; +// error = dynload::hipGetErrorString(result); +//#elif defined(PADDLE_WITH_MUSA) +////static bool CheckCUDADriverResult(MUresult result, +//// std::string caller, +//// std::string kernel_name = "") { +//// if (result != MUSA_SUCCESS) { +//// const char* error = nullptr; +//// muGetErrorString(result, &error); +//#else +//static bool CheckCUDADriverResult(CUresult result, +// std::string caller, +// std::string kernel_name = "") { +// if (result != CUDA_SUCCESS) { +// const char* error = nullptr; +// dynload::cuGetErrorString(result, &error); +//#endif +// LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name +// << " > failed: " << error << " (" << result << ")"; +// return false; +// } +// return true; +//} bool GPUDeviceCode::available_ = false; void GPUDeviceCode::CheckAvailableStatus() { @@ -148,19 +148,19 @@ void GPUDeviceCode::CheckAvailableStatus() { int driver_version = 0; int dirver_major = 0; int driver_minor = 0; -#ifdef PADDLE_WITH_HIP - hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); - if (driver_result == hipSuccess) { -#elif defined(PADDLE_WITH_MUSA) - MUresult driver_result = muDriverGetVersion(&driver_version); - if (driver_result == MUSA_SUCCESS) { -#else - CUresult driver_result = 
dynload::cuDriverGetVersion(&driver_version); - if (driver_result == CUDA_SUCCESS) { -#endif - dirver_major = driver_version / 1000; - driver_minor = (driver_version % 1000) / 10; - } +//#ifdef PADDLE_WITH_HIP +// hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); +// if (driver_result == hipSuccess) { +//#elif defined(PADDLE_WITH_MUSA) +// MUresult driver_result = muDriverGetVersion(&driver_version); +// if (driver_result == MUSA_SUCCESS) { +//#else +// CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); +// if (driver_result == CUDA_SUCCESS) { +//#endif +// dirver_major = driver_version / 1000; +// driver_minor = (driver_version % 1000) / 10; +// } LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "." << driver_minor << "; NVRTC Version: " << nvrtc_major @@ -176,18 +176,18 @@ void GPUDeviceCode::CheckAvailableStatus() { } int count = 0; -#ifdef PADDLE_WITH_HIP - if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), - "hipGetDeviceCount")) { -#elif defined(PADDLE_WITH_MUSA) - if (CheckCUDADriverResult(muDeviceGetCount(&count), - "muDeviceGetCount")) { -#else - if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), - "cuDeviceGetCount")) { -#endif - available_ = true; - } +//#ifdef PADDLE_WITH_HIP +// if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), +// "hipGetDeviceCount")) { +//#elif defined(PADDLE_WITH_MUSA) +// if (CheckCUDADriverResult(muDeviceGetCount(&count), +// "muDeviceGetCount")) { +//#else +// if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), +// "cuDeviceGetCount")) { +//#endif +// available_ = true; +// } } static std::string FindCUDAIncludePath() { @@ -465,21 +465,21 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); #elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_EQ( - muLaunchKernel(function_, - num_blocks, - 1, - 1, // grid dim - num_threads_, - 1, - 1, // block dim - 0, // shared memory - dev_ctx->stream(), // stream - args->data(), // arguments - nullptr), - MUSA_SUCCESS, - errors::External("Fail to launch kernel %s (in muLaunchKernel.)", - name_.c_str())); + //PADDLE_ENFORCE_EQ( + // muLaunchKernel(function_, + // num_blocks, + // 1, + // 1, // grid dim + // num_threads_, + // 1, + // 1, // block dim + // 0, // shared memory + // dev_ctx->stream(), // stream + // args->data(), // arguments + // nullptr), + // MUSA_SUCCESS, + // errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + // name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 838b623ae7b38..883e95c41985f 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,6 +30,13 @@ if(WITH_ROCM) rocsparse.cc) endif() +if(WITH_MUSA) + list( + APPEND + MUSA_SRCS + mudnn.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. 
if(NOT APPLE) @@ -46,6 +53,9 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) + endif() endif() if(TENSORRT_FOUND) @@ -93,6 +103,8 @@ if(WITH_ROCM) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS}) elseif(WITH_GPU) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS}) +elseif(WITH_MUSA) + collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS}) else() collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS}) endif() diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index d6f42ff743e58..4020e811f7aca 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -24,6 +24,7 @@ namespace phi { using GPU_TYPE = ROCM_TYPE; #elif defined(PADDLE_WITH_MUSA) + //using mudnnHandle_t = ::musa::dnn::Handle; using mudnnHandle_t = bool**; using mublasLtHandle_t = bool**; using musparseHandle_t = bool**; diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index b7c9f9c4688dc..b891644679264 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -61,7 +61,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { @@ -231,21 +231,23 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr, // NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // the bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; + return 0; + //phi::dtype::bfloat16 low_half; + //// the bfloat16 in lower 16bits + //low_half.x = static_cast(val & 0xFFFFu); + //low_half = + // static_cast(static_cast(low_half) + x); + //return (val & 0xFFFF0000u) | low_half.x; } inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // the bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); + return 0; + //phi::dtype::bfloat16 high_half; + //// the bfloat16 in higher 16bits + //high_half.x = static_cast(val >> 16); + //high_half = + // static_cast(static_cast(high_half) + x); + //return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } #if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 @@ -264,54 +266,54 @@ CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) { PDBF16ToCUDABF16(val))); } #else -CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) { - // concrete packed bfloat16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the bfloat16 value stay at lower 16 bits of the address. 
- do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} +//CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) { +// // concrete packed bfloat16 value may exsits in lower or higher 16bits +// // of the 32bits address. +// uint32_t *address_as_ui = reinterpret_cast( +// reinterpret_cast(address) - +// (reinterpret_cast(address) & 0x02)); +// float val_f = static_cast(val); +// uint32_t old = *address_as_ui; +// uint32_t sum; +// uint32_t newval; +// uint32_t assumed; +// if (((uintptr_t)address & 0x02) == 0) { +// // the bfloat16 value stay at lower 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old & 0xFFFFu; +// return ret; +// } else { +// // the bfloat16 value stay at higher 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old >> 16; +// return ret; +// } +//} #endif -CUDA_ATOMIC_WRAPPER(Add, complex) { - float *real = reinterpret_cast(address); - float *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -CUDA_ATOMIC_WRAPPER(Add, complex) { - double *real = reinterpret_cast(address); - double *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} +//CUDA_ATOMIC_WRAPPER(Add, complex) { +// float *real = reinterpret_cast(address); +// float *imag = real + 1; +// return complex(CudaAtomicAdd(real, val.real), +// CudaAtomicAdd(imag, val.imag)); +//} +// +//CUDA_ATOMIC_WRAPPER(Add, complex) { +// double *real = reinterpret_cast(address); +// double *imag = real + 1; +// return complex(CudaAtomicAdd(real, val.real), +// CudaAtomicAdd(imag, val.imag)); +//} // For atomicMax USE_CUDA_ATOMIC(Max, int); @@ -449,55 +451,57 @@ CUDA_ATOMIC_WRAPPER(Max, phi::dtype::float16) { #endif inline static __device__ uint32_t bf16_max_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // The bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(max(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; + return 0; + //phi::dtype::bfloat16 low_half; + //// The bfloat16 in lower 16bits + //low_half.x = static_cast(val & 0xFFFFu); + //low_half = + // static_cast(max(static_cast(low_half), x)); + //return (val & 0xFFFF0000u) | low_half.x; } inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // The bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(max(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) { - if (*address >= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - 
uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} + return 0; + //phi::dtype::bfloat16 high_half; + //// The bfloat16 in higher 16bits + //high_half.x = static_cast(val >> 16); + //high_half = + // static_cast(max(static_cast(high_half), x)); + //return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +//CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) { +// if (*address >= val) { +// return *address; +// } +// uint32_t *address_as_ui = reinterpret_cast( +// reinterpret_cast(address) - +// (reinterpret_cast(address) & 0x02)); +// float val_f = static_cast(val); +// uint32_t old = *address_as_ui; +// uint32_t assumed; +// if (((uintptr_t)address & 0x02) == 0) { +// // The bfloat16 value stay at lower 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old & 0xFFFFu; +// return ret; +// } else { +// // The bfloat16 value stay at higher 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old >> 16; +// return ret; +// } +//} // For atomicMin USE_CUDA_ATOMIC(Min, int); @@ -635,55 +639,57 @@ CUDA_ATOMIC_WRAPPER(Min, phi::dtype::float16) { #endif inline static __device__ uint32_t bf16_min_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // The bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(min(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; + return 0; + //phi::dtype::bfloat16 low_half; + //// The bfloat16 in lower 16bits + //low_half.x = static_cast(val & 0xFFFFu); + //low_half = + // static_cast(min(static_cast(low_half), x)); + //return (val & 0xFFFF0000u) | low_half.x; } inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // The bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(min(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) { - if (*address <= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The bfloat16 value stay at higher 16 bits of the address. 
- do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} + return 0; + //phi::dtype::bfloat16 high_half; + //// The bfloat16 in higher 16bits + //high_half.x = static_cast(val >> 16); + //high_half = + // static_cast(min(static_cast(high_half), x)); + //return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +//CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) { +// if (*address <= val) { +// return *address; +// } +// uint32_t *address_as_ui = reinterpret_cast( +// reinterpret_cast(address) - +// (reinterpret_cast(address) & 0x02)); +// float val_f = static_cast(val); +// uint32_t old = *address_as_ui; +// uint32_t assumed; +// if (((uintptr_t)address & 0x02) == 0) { +// // The bfloat16 value stay at lower 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old & 0xFFFFu; +// return ret; +// } else { +// // The bfloat16 value stay at higher 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old >> 16; +// return ret; +// } +//} #ifdef PADDLE_WITH_CUDA /* diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h new file mode 100644 index 0000000000000..f6131fb1e53d6 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_device_function.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. +#define PADDLE_MUSA_FP16 +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...)
\ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down_sync(mask, val, static_cast(delta), width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down_sync( + mask, val.to_half(), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16(__shfl_down_sync( + mask, val.to_mt_bfloat16(), static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = + static_cast(__shfl_down_sync(mask, + static_cast(val.real), + static_cast(delta), + width)); + double imag = + static_cast(__shfl_down_sync(mask, + static_cast(val.imag), + static_cast(delta), + width)); + return phi::dtype::complex(real, imag); +} +#if 0 +//template <> +//__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( +// unsigned mask, phi::dtype::float16 val, int width) { +// return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); +//} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16( + __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return 
__shfl_sync(mask, val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} +#endif + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} +} // namespace gpu +} // namespace backends +} // namespace phi + diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h index e69de29bb2d1d..57135ac49d905 100644 --- a/paddle/phi/backends/gpu/musa/musa_helper.h +++ b/paddle/phi/backends/gpu/musa/musa_helper.h @@ -0,0 +1,34 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
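// [Editor's note: illustrative sketch, not part of the patch.] reduceSum above
// expects every thread of the block to call it with tid == threadIdx.x, and it
// relies on blockDim.x <= 1024 so that the 32-entry shared buffer holds one
// partial sum per warp. A minimal caller, assuming n <= blockDim.x (SumKernel
// is a hypothetical name):
__global__ void SumKernel(const float *x, float *out, int n) {
  int tid = threadIdx.x;
  float v = (tid < n) ? x[tid] : 0.0f;           // inactive lanes contribute zero
  v = phi::backends::gpu::reduceSum(v, tid, n);  // warp shuffles + shared memory
  if (tid == 0) *out = v;                        // thread 0 holds the block-wide sum
}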
+ +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (0 >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi + diff --git a/paddle/phi/common/.float16.h.swp b/paddle/phi/common/.float16.h.swp new file mode 100644 index 0000000000000000000000000000000000000000..77f57398adbb1a6c5a54b0e5dc7285fccf261999 GIT binary patch literal 16384 zcmeI3U5q4E6~~K3Se9J`g+yNrmsuRU+1Z}%p4r~rc6PLxnPocc&aU&ZyF^UocGd0f z!s@D~K6-k91tSj{NCYL(1mlAtMl>-C5`7{NeDK9+6vg0!A2HFy=!+UqK*0aGb!(=& zr+Yru#Dt_K`LQ!q=bn4+x%ZxX&h4g_KUY1?j^zp(es0h-ckw0j?VeDfz;)V0v5=>R2S;vUH-2#CGIXR(QV*maNIj5xAoW1%fz$)32T~8D9(b>M zAZYK?9)j_E)Bx^R|6bAe@4M>fQT6wEu}df!#=9c6H=|4sG&arOSseed5;?_;|=S4M(yd-bQrRr`PRy;qOZe^L*m9!Ncq zdLZ>c>Veb)sRvRIq#j5;ka{5XzrJo&#S7kAsK7=fO0{fDeF|4rto5;G5tPum*f^H&_7+pbU_>ZW2|NM50`3D%a0=WBCc%fm4d8n4`)f7r$KVop8hjf(20B0o7r=RN5*!9M zf&*YbczvIy{Rlh{9s^$lUjXyq4saBF6x<4adyS@j6?_WZ3a$ii?$xxvfxm)FzyqHK zlVA_{!`1K?_&RtJJOVs01^#pu#t0U{Dewt!Gx#vL3HkmLtrJuAeNFW^_HvxML=s zo~_Kzo|@CU$4YpoX$(JIc7omJ3CjtD*Y-r98{7}3Li}AQ>A`K&nNd&p*^f(E_3aKl zV0eahod)D>?g+hJC}lcEC!k;S`b04kwA$G*mQOaED0U~&6mO`hZ5Hm;&9E({M?2hx zXzw7DvFEGH^ZG(%xq5C+KY6xL+OgeaLS%Q7+jZ5iuFF9b=}{)?mAsJI@Z5a$?2L5X z0}muTcY0;1vi>+BJ+m@f(JPBH^W^N&QKsvOCv5rzo5<(0S?yqAa@4q_oE#9fyBiLk zKeMzvJ9n-+Gsi5)#zZpFZQGV%1x$H9BmZS&H^WvxC>+zOqvb^~Y$^@c@dK8e(57_D z4tk54uFv`?Yf`t2jgQ9$VLmM%i;QvG;y$ap9z0NQ7fO*-^t%*c4iBu3U~(?7&tI$*0Ce3X2@HDB}Jk@duV_jugVofE- zMypG{9BAb*jU+r&<+j~nq&r>$G)6W1y)(1Eh&8N<8Cq#xXauh3Q*e1OUbR_lcuXe` z?SM9jS!W(!h6=MG{* z1E&$=0kZ{86^8xJS@CdfEfB2chK|X-wK2BZw2UT6nDCs-M7@rXC!B!PtoWj5WOK7L zHj;Au5R(uzpKJX$O@WwYQ_m=w=ZUO<~wH!5Qh4Oqb3C zVI$rQZ^8qtomvn6SZe3_COhn`&C^aUkPyt~uS` z)yCdcds^4k*|#yCbzSs*&mAcmbY>3FXN2ai^^w8R8H_j~txv@BqxmEFW}G?^53buC z`@jpGz-kHUhRtUS#vQj!o+&bJQ*1_1kDW@67@|c=Dib!TA$t>QX#7Le^h%CIU$|9Y zvh@VMg^SQ4I+lUKjKJ!3%WSJ<1wN*E{fPxpPw-aqnbG0_)X7Y8W1@UwlM}60 z3lg5?L_*1BtLsZkb46WG={2-}5hxwi#vcHVI< zlTlj15VGqCdjYn1uquSZ$dNMIGz_CqTwmk5uJ)kJbrE=3VqsbnQ39MiSoPETh0N2t%{B@F57VctqRu^W+n7|nTTaZ#Bc}HCLnAJjdDNJdSX<@0z zXk%_Rni9CA0L5{KjZ~Icb!mj%URkOxjmc>3*reJM_lA~-z+b~D(xwcS<5a+)R;gFs z@I-WQS`J(FtN^1MW4>DtR&lIC4QZmEW)QT?(X1TDDWvKm#YBYirJR0c?Z1k)~Ghi7b z%ZmA8k)4%av5)@CDq+w>K9yq?oNJ3D=(9!P3$H^nNA>^BxBz$&wIJ31@#o*qq1JyG z90PBm&VLqkz&&6Nyoi%cICGZpo!Fg~87zYQy3#j4052)^M0s%~L0p!4ba1GcC zeu&!r2fzdd$bkdkTCfkijr#oO;9KA!Ks9^;Tmyc$54M1N!7O+KwfM{6yWnA9fZM@G zz`Ll!Uj@&AM}Pq;;3jY*_#bDk6aN1-_!amip!s+Kq(57F;G#SxH}W$aMVdy% zEG{BK&tb^U%4M0fmCL;5XE?Ua=AtDj6{mqD9i{gJ-i|UZ>*tG9jYPHcfDTdV>2b#G zM%MQrt!g0Npj^i^EmCn2F?WjP4Np>d2y+wyUQf{P}PD5Yd z3;(A+5$$mg^uP3H>}rgbgIg{uSle!RY7#Q}+>u>)(4)dc7oO*O+jTH>g-q^hJfiJe zwtM+2q*kx*e0-S}?`AJ|VD(*f#Zcm+>b=~~cICtjwpcVQY!$UXqWxcUT~yzev*sW- z(Q5*CR08)Ab(FAz_SFDkkGJW16Xot^2_(WuGP*kQDAz%iTF66ka+9KdF$fh(gQYB| zSJ~kq+7fcmcDOJxP@Tt3i|%xb1K*P6rqw{5AniUfP$(fVxQ|GHhE&+3K)M_E9xc}l zZ5J|II2vN|8UX3wc5;?5jvP5YR7~Es$o;NI`_~WFkjy22KY^LXl}z*~aCJu@-Sp*! 
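// [Editor's note: illustrative sketch, not part of the patch.] The
// CUDA_KERNEL_LOOP_TYPE macro defined in musa_helper.h above expands to a
// standard grid-stride loop, so one launch with a fixed grid can cover any
// number of elements. Written out by hand it is equivalent to (ScaleKernel is
// a hypothetical name):
__global__ void ScaleKernel(float *data, float alpha, int64_t num) {
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i = idx; i < num; i += stride) {  // each thread strides over the whole grid
    data[i] *= alpha;
  }
}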
z)aE77?~@~6SmQA;L?+>CF%8myRBp# zDz_BGG>9%oO*L$k@tk5&iw0_l?c``Qa!!X21AVIdF6+L8sJdcu!j|wXLyt<@sR2j$ zG;VDIfdm~_JZuW}?2xR%^5t0}Wa^-9)zxs+i*2)n-d6DFc8SHH>;<>oXo&m;`G(f3 z(c#YJX{3fqqs(MBFJpXA5GI11@-BQII?){M%(a8pZLB<+e`GA6MTick63=OEL~`ZG4!Pz%ZtHTL za3`43Hijyr|u*PIc`y4hz2t^{!J>WA{)xE|W}nk=DFRLh$Hr(+nhZkCRW zJ|>_tTV-iEHZM?W<2qO{y7cfYTw!7iFp1&bB6(##bLdc^K(_?Ar$&rN5J#-QO)qJa zMp1PSbEvp)dv1p`aR=6fe&Dt!-caLtBq!(xR}bo@o_c}?vJ?MDeV)-p8#4|aibsLV zczJ8@GZ>KX%1bB9Csndjd`gEqgqn++cI9^|(Kny!vZY?fkdR406Z(&495uf3h5879 cK3QXka4^cX@i8uaREFQ>@L2;Nra9Vw0VR%^C;$Ke literal 0 HcmV?d00001 diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 4cc21a14faac8..73714adbf450c 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -26,6 +26,15 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif +#if defined(__MUSACC__) +#define PADDLE_MUSA_BF16 +#include +#endif + + #if defined(__CUDACC__) && CUDA_VERSION >= 11000 #define PADDLE_CUDA_BF16 #include @@ -63,7 +72,7 @@ struct PADDLE_ALIGN(2) bfloat16 { x = res >> 16; #elif defined(PADDLE_WITH_MUSA) #if defined(PADDLE_MUSA_BF16) - __nv_bfloat16 tmp = __float2bfloat16(val); + __mt_bfloat16 tmp = __float2bfloat16(val); x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); @@ -163,7 +172,7 @@ struct PADDLE_ALIGN(2) bfloat16 { return res; #elif defined(PADDLE_WITH_MUSA) #ifdef PADDLE_MUSA_BF16 - return __bfloat162float(*reinterpret_cast(&x)); + return __bfloat162float(*reinterpret_cast(&x)); #else float val = 0.f; uint16_t temp = x; @@ -190,6 +199,12 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#ifdef PADDLE_MUSA_BF16 + HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 43e513146ba0a..f4c5be53660aa 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -201,7 +201,7 @@ template HOSTDEVICE inline complex operator+(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) + thrust::complex(b)); #else return complex(a.real + b.real, a.imag + b.imag); @@ -212,7 +212,7 @@ template HOSTDEVICE inline complex operator-(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) - thrust::complex(b)); #else return complex(a.real - b.real, a.imag - b.imag); @@ -223,7 +223,7 @@ template HOSTDEVICE inline complex operator*(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) * thrust::complex(b)); #else return complex(a.real * b.real - a.imag * b.imag, @@ -235,7 +235,7 @@ template HOSTDEVICE inline complex operator/(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) / thrust::complex(b)); 
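// [Editor's note: illustrative sketch, not part of the patch.] On the fallback
// path in the bfloat16.h hunk above (no native __mt_bfloat16), the constructor
// keeps only the upper two bytes of the float via memcpy(... + 2, 2), which is
// a plain truncation and assumes a little-endian host. An endian-neutral way
// to express the same conversion:
#include <cstdint>
#include <cstring>
inline uint16_t FloatToBfloat16Bits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));      // reinterpret the IEEE-754 bit pattern
  return static_cast<uint16_t>(bits >> 16);  // keep sign, exponent and top mantissa bits
}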
#else T denominator = b.real * b.real + b.imag * b.imag; @@ -247,7 +247,7 @@ HOSTDEVICE inline complex operator/(const complex& a, template HOSTDEVICE inline complex operator-(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(-thrust::complex(a.real, a.imag)); #else complex res; @@ -261,7 +261,7 @@ template HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) += thrust::complex(b.real, b.imag)); return a; @@ -276,7 +276,7 @@ template HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) -= thrust::complex(b.real, b.imag)); return a; @@ -291,7 +291,7 @@ template HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) *= thrust::complex(b.real, b.imag)); return a; @@ -306,7 +306,7 @@ template HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) /= thrust::complex(b.real, b.imag)); return a; @@ -369,7 +369,7 @@ HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { template HOSTDEVICE inline bool(isnan)(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isnan(a.real) || ::isnan(a.imag); #else return std::isnan(a.real) || std::isnan(a.imag); @@ -379,7 +379,7 @@ HOSTDEVICE inline bool(isnan)(const complex& a) { template HOSTDEVICE inline bool isinf(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isinf(a.real) || ::isinf(a.imag); #else return std::isinf(a.real) || std::isinf(a.imag); @@ -389,7 +389,7 @@ HOSTDEVICE inline bool isinf(const complex& a) { template HOSTDEVICE inline bool isfinite(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isfinite(a.real) || ::isfinite(a.imag); #else return std::isfinite(a.real) || std::isfinite(a.imag); @@ -399,7 +399,7 @@ HOSTDEVICE inline bool isfinite(const complex& a) { template HOSTDEVICE inline T abs(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return thrust::abs(thrust::complex(a)); #else return std::abs(std::complex(a)); @@ -409,7 +409,7 @@ HOSTDEVICE 
inline T abs(const complex& a) { template HOSTDEVICE inline T arg(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return thrust::arg(thrust::complex(a)); #else return std::arg(std::complex(a)); @@ -419,7 +419,7 @@ HOSTDEVICE inline T arg(const complex& a) { template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); #else return complex(std::pow(std::complex(a), std::complex(b))); @@ -429,7 +429,7 @@ HOSTDEVICE inline complex pow(const complex& a, const complex& b) { template HOSTDEVICE inline complex sqrt(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::sqrt(thrust::complex(a))); #else return complex(std::sqrt(std::complex(a))); @@ -439,7 +439,7 @@ HOSTDEVICE inline complex sqrt(const complex& a) { template HOSTDEVICE inline complex tanh(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::tanh(thrust::complex(a))); #else return complex(std::tanh(std::complex(a))); @@ -449,7 +449,7 @@ HOSTDEVICE inline complex tanh(const complex& a) { template HOSTDEVICE inline complex log(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::log(thrust::complex(a))); #else return complex(std::log(std::complex(a))); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index 6783799026d44..cbbd632aa2484 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,7 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) +#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +209,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 572f460197f08..00de1bf605157 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -50,6 +50,11 @@ #include #endif +#if defined(__MUSACC__) +#define PADDLE_CUDA_FP16 +#include +#endif + #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include @@ -87,7 +92,7 @@ struct PADDLE_ALIGN(2) float16 { #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -106,7 +111,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -148,7 +153,7 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -222,7 +227,7 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -242,7 +247,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline operator float() const { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -399,7 +404,7 @@ DEVICE inline half operator-(const half& a) { #endif } -#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +#ifdef PADDLE_WITH_CUDA // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -1014,13 +1019,13 @@ struct is_pod { is_standard_layout::value; }; -template <> -struct is_floating_point - : 
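// [Editor's note: illustrative sketch, not part of the patch.] The float16.h
// hunks above only widen the preprocessor guards; the conversion itself remains
// a raw bit copy between the 16-bit storage and the toolkit half type through
// __half_raw. The same round trip in standalone CUDA code:
#include <cuda_fp16.h>
#include <cstdint>
inline __half BitsToHalf(uint16_t bits) {
  __half_raw raw;
  raw.x = bits;        // identical binary16 pattern, no numeric conversion
  return __half(raw);  // __half is constructible from __half_raw
}
inline uint16_t HalfToBits(__half h) {
  return reinterpret_cast<__half_raw *>(&h)->x;
}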
std::integral_constant< - bool, - std::is_same< - phi::dtype::float16, - typename std::remove_cv::type>::value> {}; +//template <> +//struct is_floating_point +// : std::integral_constant< +// bool, +// std::is_same< +// phi::dtype::float16, +// typename std::remove_cv::type>::value> {}; template <> struct is_signed { static const bool value = true; diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 4286dfcc1d0fa..c8ced345a637a 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -140,10 +140,10 @@ class ScalarBase { return static_cast(data_.f32); case DataType::FLOAT64: return static_cast(data_.f64); - case DataType::FLOAT16: - return static_cast(data_.f16); - case DataType::BFLOAT16: - return static_cast(data_.bf16); + //case DataType::FLOAT16: + // return static_cast(data_.f16); + //case DataType::BFLOAT16: + // return static_cast(data_.bf16); case DataType::INT32: return static_cast(data_.i32); case DataType::INT64: @@ -162,10 +162,10 @@ class ScalarBase { return static_cast(data_.ui8); case DataType::BOOL: return static_cast(data_.b); - case DataType::COMPLEX64: - return static_cast(data_.c64); - case DataType::COMPLEX128: - return static_cast(data_.c128); + //case DataType::COMPLEX64: + // return static_cast(data_.c64); + //case DataType::COMPLEX128: + // return static_cast(data_.c128); default: PD_THROW("Invalid enum scalar data type `", dtype_, "`."); } diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index e80561284b885..620d3d683fbf0 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,6 +153,12 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first), + CastToCUDATransformIterator(last), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -184,6 +190,13 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first1), + CastToCUDATransformIterator(last1), + CastToCUDATransformIterator(first2), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index aaa3eebfe27a5..f07a763ac52d0 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -414,6 +414,17 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) +#elif defined(__MUSACC__) +#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ + do { \ + if (!(_IS_NOT_ERROR)) { \ + printf("Error: %s:%d Assertion `%s` failed. 
" __FORMAT "\n", \ + __FILE__, \ + __LINE__, \ + #_IS_NOT_ERROR, \ + ##__VA_ARGS__); \ + } \ + } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index decebbe66a538..81e663fa20df6 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,6 +18,10 @@ #include #endif +#ifdef __MUSACC__ +#include +#endif + #if defined(__xpu__) #include @@ -26,7 +30,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 2e78357492734..f3dae52b04387 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -53,7 +53,7 @@ namespace phi { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index f96fdb1f28b63..d72046a82e0cb 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -281,17 +281,9 @@ namespace phi { PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::BFLOAT16, phi::bfloat16, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::FLOAT16, phi::float16, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE( \ NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::COMPLEX64, phi::complex64, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::COMPLEX128, phi::complex128, __VA_ARGS__) \ default: \ PADDLE_THROW(phi::errors::InvalidArgument( \ "Invalid enum data type `%d`.", static_cast(__dtype__))); \ diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4b21e61f8d88c..e57ac9d80f6c1 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -26,19 +26,23 @@ file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") file(GLOB kernel_primitive_h "primitive/*.h") # fusion ops would be included here +#file( +# GLOB kernel_cu +# RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" +# "gpu/*.cu" +# "gpu/*.cu.cc" +# "gpudnn/*.cu" +# "kps/*.cu" +# "legacy/kps/*.cu" +# "legacy/gpu/*.cu" +# "selected_rows/gpu/*.cu" +# "sparse/gpu/*.cu" +# "strings/gpu/*.cu" +# "fusion/gpu/*.cu") file( GLOB kernel_cu RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "gpu/*.cu" - "gpu/*.cu.cc" - "gpudnn/*.cu" - "kps/*.cu" - "legacy/kps/*.cu" - "legacy/gpu/*.cu" - "selected_rows/gpu/*.cu" - "sparse/gpu/*.cu" - "strings/gpu/*.cu" - "fusion/gpu/*.cu") + "gpu/a*.cu") if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc index 9626621ae8657..0b324d584e4d4 100644 --- a/paddle/phi/kernels/activation_kernel.cc +++ b/paddle/phi/kernels/activation_kernel.cc @@ -32,7 +32,7 @@ using complex128 = ::phi::dtype::complex; 
PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(relu6, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index c44b6333154cc..425ce19808ea4 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -135,7 +135,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index dba08b0de366a..3f9050af76d8a 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 9f4b51281cd37..87dcd2eaa01ac 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 58cacd21bba18..8a694bec4a9b8 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(coalesce_tensor, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 7b9074ffa92f3..d47c98608c91f 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -257,7 +257,7 @@ PD_REGISTER_KERNEL( #define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \ PD_REGISTER_KERNEL( \ - name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {} + name, CPU, ALL_LAYOUT, phi::func, float, double) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 046cee5857808..62ae48766057c 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -191,8 +191,8 @@ PD_REGISTER_KERNEL(exp, float, double, int, - int64_t, - phi::dtype::float16) {} + int64_t) {} + //phi::dtype::float16) {} PD_REGISTER_KERNEL(expm1, CPU, @@ -201,8 +201,8 @@ PD_REGISTER_KERNEL(expm1, float, double, int, - int64_t, - phi::dtype::float16) {} + int64_t) {} + 
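// [Editor's note: illustrative sketch, not part of the patch.] The transform.h
// hunks earlier in this patch only swap the Thrust execution-policy namespace:
// on MUSA the backend is selected with thrust::musa::par.on(stream) instead of
// thrust::cuda::par.on(stream), while the call shape stays the same. The CUDA
// spelling of that pattern, with raw device pointers and a caller-owned stream
// (NegateOnStream is a hypothetical name):
#include <thrust/functional.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/transform.h>
void NegateOnStream(const float *in, float *out, int n, cudaStream_t stream) {
  thrust::transform(thrust::cuda::par.on(stream),  // enqueue asynchronously on `stream`
                    in, in + n, out, thrust::negate<float>());
}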
//phi::dtype::float16) {} PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {} PD_REGISTER_KERNEL( @@ -220,9 +220,9 @@ PD_REGISTER_KERNEL(log, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(log2, CPU, ALL_LAYOUT, @@ -230,9 +230,9 @@ PD_REGISTER_KERNEL(log2, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(log10, CPU, ALL_LAYOUT, @@ -240,9 +240,9 @@ PD_REGISTER_KERNEL(log10, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(log1p, CPU, ALL_LAYOUT, @@ -250,9 +250,9 @@ PD_REGISTER_KERNEL(log1p, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel) PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel) diff --git a/paddle/phi/kernels/cpu/cast_grad_kernel.cc b/paddle/phi/kernels/cpu/cast_grad_kernel.cc index 403caf997dbf7..fad74ef9e7ce9 100644 --- a/paddle/phi/kernels/cpu/cast_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_grad_kernel.cc @@ -25,9 +25,9 @@ void CastGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, DenseTensor* x_grad) { - PD_VISIT_ALL_TYPES(x.dtype(), "CastKernelImpl", ([&] { - CastKernelImpl(dev_ctx, out_grad, x_grad); - })); + //PD_VISIT_ALL_TYPES(x.dtype(), "CastKernelImpl", ([&] { + // CastKernelImpl(dev_ctx, out_grad, x_grad); + // })); } } // namespace phi diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 442290c3648e2..658135c36fd72 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL( dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 54449200ae4b2..76377d201e274 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 3ecef871d211d..595f38e03910f 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc 
b/paddle/phi/kernels/flatten_kernel.cc index 6b22ac7518179..b7b41782ba092 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 982b6a396c2a8..e709be621c8d6 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/.im2col.cu.swp b/paddle/phi/kernels/funcs/.im2col.cu.swp new file mode 100644 index 0000000000000000000000000000000000000000..530e1d87fe3d7b003699505266a4bf81f84ee1b8 GIT binary patch literal 16384 zcmeHNZHya786HY$DYQwHwn+S-nqEa@=N$X)66M2p=WyI5sYQBMQqiKo?5~xCai2kUMP=J;eeu0n(0`VyXLPG5iB!tQzghWAzKU72r)RvFuo!Rl) zzVq2pf(T+)dc2;Q_iNsHXLjb@T^u=6euN*+k1|}l7%Tkc`}UO&9_*hGpB1sJPA0ivJfHby-*ufhU}>u+;RbVL`fMRo}^1 zo7>4t+fomt9=IM4+{5;djgAnc z>Vf}D4_G#1yV3I(>7b1F|K|Mv!<~%%0C*CZ1nvj!1Af1Qu~&gF0vCb%fnVIi*w=xt z0iOdt4txwa4Lkt+>2Aip3w#RrBybs60hWQYKpwaY_}x396L=PA0LOv%0N38Z*w2A4 z08L;P$N;wkFW<%3Gr$}$4qSUXW4{Kz0z3)457+_x>Q2T!0jvVUz#r~l>;=FBjsS;& z*WSk1kAY_Z2^?-hO;2baqj01muD`T$!SAnO2r+`bq8qfqH;4xqlH~_qg zoWyT|Zv!E)3>1O8fmh&v8iNg<=6eez1p2m#9~LX>-3J0t@;x*^JgOBUWWqO#5CW)}uY7 zmi~ag(Oh(7waBWD6^6X*j!pZ{aeVDT#w!)-(~n}2s{O#VqGEAn`uJRqcSy|fg%ZES zBH=b*x8M_6To;Y5TqrTVw*bH!=cU(ZMwM#a^5{F09AA=7B!Ws^$XY#W71yL4)pOi$ zMnr8IS7gYKE2Y_IduB>G^qJn;lql@ok~<9{_F zP?}qHXisGAZJ7*@#Fy@!E@+$JCasWsraQuim4+<2v14$kL61{KdP!d{a}1oo;2SNR zM4`VVJ7G|xVtO>JI;2!|YE8t-C7w}TaI6gsG=FfC&tMQvsuE&w_TjVB**2}1XtCZl z8K_g`FuI4zai+$A%v0T^UTXn* z{fVjfar)RxdbRezB_`E#azF=@HkBt_vF$BgW{imov!fLvFtKAcC*>X!CJCVKe?^3% zE!O#ayFxN+#s%}HU$F3_v5-QW=k$NwqfH$`#3Xa4!=xew0U1>2G6)r|VadYWWUXj3 zwL{d=L{tyXmR)*a5FP!xsyI5dP@$*M#!xjZz3ntZ&@xl0(b1;Ieh|ZGUTaAc+-tfN zYoVC}7{x`$udZN$%daC`YIthSNH@QMncHgEpx9R>YFbBAR1bt@D<+Bu{lFu;v>qBm zD#ei*s&yReYvX`870BC)>{yt#rA(d{ zgOSy@B{L7%enbQG)kI#7RVp>dU$mS`MQ{G}XS(sm&%_ID4!gH!*oiUEaz)s%s)DPH z9-gR44fcO1vxlTtb(*%|L;5S>-d_gShqflJ`mXDH1B<5<|m)-l<=C0QF>RlGAMhNLeOdN*HctFfG#AIPZg!iNr7 zr?%@MZ{oQSU`Z1EBsSdGo#$z@J1<0g9jC1U(S!|Al}ob99m}gVt(xF9e-+W*5rG=K0(yK0P-(Q(h>~&7yRiPtBg=A1Kex z;1ll4WOD$i6uh6L+5B;TR%?gCxtJI^Y zQ7jbJ*4FYCg0L>~eo!kw5P2>voGecto1H(lZ!C|`W6$|b?phn%UyLkBw6M@OP+ID6 ztQ9eA7Zsi7N4$xhX;ep-FyRH9Q56k(EP$1qVR;DqH8`qWM4rV1*Y2DtWt&>04Mxhx+W85`q|s!#0oFP~~gbw3F6e9Ce7DIyH{ zDG`cbl}3op|2uIuJ&N-zo&S%;S&)};*8c|Z9Pnvi8u&ZT`hNp{1N;>DEN}_1fxW=J zz?(Sl{|NXh@EPEvzzQ%AJOZ2q?gM^;^Zpg!46qA$3FrK$fhpj1oa?^@1i(Jvk2u#q z4^)760lR^nzz*P-INQGnJO!Kp-VI#C`Tjcqo%cToJOI3g^ZoaLj{plm0r)G<_CEu@ z1k`{dz#YI#@WmBi8IZs{K>jHK>6LmQ^+3x5mpLyKc*k>`b=Rq1mT;YHd1n_Gj96#>#HL4Ab27@t#|xHa)Rzx0q?R zkDQvgRUD<9bE_EhpNOcza+d#}tRdf^5vF+xUeM2J3|-j2p6J zpre0(#}1Yl=r5=5L4P0-HykH0!=$}*pdk9jq>IH6b75j)f--D5oPH>QhPN*xIm({V zo1r|$L|ASnTPl^1NpSM5`J?67(%rA}bJVC4$;4|yJDy~UDweKOdlghw)^j?6rwSW5 z6VeMaS9)zEkLM_xZ)`V{mJ(Yk^`|%=NB(VeWb6QPZ%HNPWRucHoD(;TM_UV+_m7#) zOj1v;ElhpatBr(cgESx9n%UIH%#`o!2r527ynN3dlO5MNd{vv|{W^6W-da^$y`84R 
zTPZr+OV5F=^~BX%cICiU3Jw?rodZU08__B{8_4NsVErcrC{z}TUSFv5b(SNUfFnKP zHbmM&cu{`HgbRsNI}Q;9IBXY-WpCATq&*eXnl30aL#^PY42yX$l8G0_iX)cr4swW* u9J*T6or~^hXOSpdt0Es}R908U=T`O(4v literal 0 HcmV?d00001 diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index f90147b013023..662b9275aa7aa 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -1,4 +1,4 @@ -#add_subdirectory(eigen) +add_subdirectory(eigen) add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 203f6837d4611..ef13b248f4c90 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -2566,7 +2566,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index 5f66f6f1abd4d..4c4bf031b4338 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,7 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +63,7 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e754ce3bf49e4..5f19522d28f18 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,7 @@ limitations under the License. 
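// [Editor's note: illustrative sketch, not part of the patch.] LowerBound and
// UpperBound in the algorithm.h hunk above are ordinary binary searches that
// now also compile under the MUSA device pass. A typical device-side use is
// mapping a flat element index to its row through a CSR-style offsets array
// (FindRow is a hypothetical name; the phi::funcs namespace is assumed):
__device__ int64_t FindRow(const int64_t *row_offsets, size_t num_rows,
                           int64_t element_index) {
  // index of the first offset strictly greater than element_index, minus one
  size_t pos = phi::funcs::UpperBound(row_offsets, num_rows + 1, element_index);
  return static_cast<int64_t>(pos) - 1;
}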
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +27,7 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5a7574b56a891..3086d5dc4ed14 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -21,6 +21,10 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/funcs/segmented_array.h" +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/gpu/musa/musa_helper.h" +#endif + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu index add838106bfe8..00e885eeac5a1 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cu +++ b/paddle/phi/kernels/funcs/cross_entropy.cu @@ -154,9 +154,11 @@ void CrossEntropyFunctor::operator()( template class CrossEntropyFunctor; template class CrossEntropyFunctor; template class CrossEntropyFunctor; -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(8, 1, 0) +#if defined(PADDLE_WITH_CUDA) +#if CUDNN_VERSION_MIN(8, 1, 0) template class CrossEntropyFunctor; #endif +#endif } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a30fb79f8c8b0..f0235f0baec5f 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) thrust::device_vector diag_vec(vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index abade7ac0ef87..2ae5c912db937 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include #endif @@ -28,7 +31,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -49,7 +52,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) - std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -113,7 +116,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) namespace kps = phi::kps; @@ -186,6 +189,69 @@ struct normal_distribution { static constexpr int kReturnsCount = 2; }; +#elif defined(__MUSACC__) +template +struct uniform_distribution { + __device__ inline T operator()(murandStatePhilox4_32_10_t *state) const { + return static_cast(murand_uniform(state)); + } + static constexpr int kReturnsCount = 1; +}; + +template <> +struct uniform_distribution { + __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + murandStatePhilox4_32_10_t *state) const { + return murand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + murandStatePhilox4_32_10_t *state) const { + ulonglong2 result; + uint4 rand = murand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + murandStatePhilox4_32_10_t *state) const { + return murand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; #else template struct uniform_distribution { diff --git a/paddle/phi/kernels/funcs/eigen/.extensions.h.swp b/paddle/phi/kernels/funcs/eigen/.extensions.h.swp new file mode 100644 index 0000000000000000000000000000000000000000..e41cbfca9e327039544418de0e5a9e6d0fc9fc8d GIT binary patch literal 16384 zcmeI3Uu+yl9ml7o(AH2|1&J4m_{Emc862O_N$liojEv*7N0M`~oir__$K3Ah-N4@N zdUrNw69rHJsS=_Pq#_}~Ye+%x(n?4a)TT%f@xTMLf`|429{SK1B2;*2Lg71myY~5f z{*&d80L@9CoNx9wzu(Mve!DxfJE@PIshpxCqa}vVeT;qW_*MJ<$3|{{t;$&F`$`w@ z_9-2moH#n^OqT2kzLRaS>9<5N;I{3GS}Trt`gbvC%Hn55=m|F}ItTj}v9)AH**$N!cdpGwO=pE}O` zr^V*WC}0#Y3K#{90!9I&fKk9GU=%P47zK<1MuC5W0^DZoUiiJC7vgyTpRNC&N8wMw z8aNCFz@6abcQf`I@Ktaeym=2}&wxwd8{kXeA_&0>sDMFm<6VsX7Ca5EfNz1PKmZ;E z@a2AY!KfaBzSHUab8n_5NFc01f_JOzXu;$0$ zS?~<_9`Jz!J^{*L2;2s4VqEpuzS-wN3(l~{(N;z=dBD?i5-_FT|M$l-1z z5{pVS=eGh^tY)PnX>r9I#Ot*MbdMnC-9}nmJN6;jTMIJJcUBVq1P2? 
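// [Editor's note: illustrative sketch, not part of the patch.] The murand
// specializations above follow the same shape as the curand-based ones they sit
// next to: each thread owns a Philox4x32-10 state and every call produces 1, 2
// or 4 variates (kReturnsCount). The CUDA spelling of that per-thread pattern,
// which the murand_* calls appear to mirror (FillUniform is a hypothetical name):
#include <curand_kernel.h>
__global__ void FillUniform(float *out, unsigned long long seed,
                            unsigned long long offset, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);  // one independent subsequence per thread
  float4 r = curand_uniform4(&state);      // four uniforms in (0, 1]
  int base = idx * 4;
  if (base + 3 < n) {                      // tail handling omitted for brevity
    out[base] = r.x;
    out[base + 1] = r.y;
    out[base + 2] = r.z;
    out[base + 3] = r.w;
  }
}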
zQE+#rT;4F6q6=u(eC2`pYVFwknabRJ?fB{H9LjV#ZSNs&wbp7uC@iVBrWA&?hMQZX zW0Bo!hwk@}*Qc@wBI#=9J9K5;X1kssTspv~Vt3Au6vpEaeQ2vF+2IeJndxK$qvYnRNP{0`*GbB7>&##v+tjV8yV46v5#q%lN+g8>PkH#wH63lILWui zTcS&db;l>A5+Oz}{nlffRqNQ`WZ~}A)tvZfXN-nVvE-Wo~uG2kN+io{mMS_Gv)vCOzFQGVzZ$0p&b7U+>VCgv$_tCZ7 zaUEyU$r0D}+aglAqhkAJo~TXPXzn3wHY|mbIB+;moc=e{TOZVehsN>@>iWe9xulax z-xmIp(9m1>6!euQvi~)rX{Kxv?#1F!rK22>eh9yOm2 zTI_PK5%Y$ihJOwltM!NlB5cVh*-cH{v$dOtE6WcqoL;7fXBQV|tIL)7C0badxrOSn z%5r6)ieJZRw)zO2s8o-QkdSCbNMaRp-zXx4l$exDVJA(pB{E3XV~onyMw5&%J>M|E z>VkVTIJ-oZr9t}G>{4ZEL^tTP=W5qsXFXz5eT{g47#7ET5uIx5sJsyh1W`U_{nQSz zX7L&$6#0&7^H9Kz_M|^erGj#?*lxE+Ig+Uee;MoHA7QPm*Zh2&??ijnn};`IamH!0p7u=OYV zoTjNx9|dLNQ>^RxQu7%Q6vwB06jCT4pb|(N9xjTK%6J>Mq>Yz!qH zHcmhV2?=rF$PI+V4Q_BmNWcM*%8@Gw!2u*d;=;!T3B-Ta>oks2+l_<}Xjb}Vv%7EJ zzWKk|nHObh2hSEy)6vWf!*Lg5)-NAAzdxD2;g6pf3#3%~_NZfK?#S%ik;ai3XVxBO zYqeyXTfXf$F2C50LOc20@>{~X!~>7Jq1A}IdT4Rc`->*pf=v0dXVn9du7aa#>^;S6fg=H1&jhl0i%FXz$jo8_&+G1{9WutbmWGlGmj_d zU0cq{r8yY|i~>dhqkvJsC}0#Y3K#{90!9I&fKk9G@E=sbb{Kp4M#etB3Blw4|GmHe zKfaT(kH8DyIM@g71>fJn*mvM9@D$hwzSzsyyWkb@G?)W_-Okvb;Aij^_!4{oUIi8K z5ZDFYzm2gsK^YtXyTK=W7<(JkKn~1-3Gm|tW1oUIfDiH@2j;*8_~ur|-UBazG9d5( zxF6gHzP*L9cYqI$0tT+^X6!W(fD*`nJ>Z9%8T$;p4m>aqCctmd=~wVI(3)KV<}eBv z1^#6PE)!W6d3KwJzFp^pJwMiRC9TL`q=nQ8m_d}SS@O?>CEgn-p4YHgB|KEb~DsMcxv|W zMK#K1=<^6pj*jqAlhTC~BE=!KtvH1$i0Xrg9?2i03p;3cc!%u{?XX>ThwWzKcKb2Xg~$5``&1%mnzVk?G;myC?6OY8Wk4h& zm2qOhr*N94<0QOHyf<%3HX0#UVVd}w;+_iQ^yxNzM_qcMqCybZ>xZ%@gqqwir{`Ip zW%pvxzW1JMTNk}5KGWEJ5uf+kB!WzDXTBV%eoNgyZ5b+Ci8`%H;n32m?M`*W*gEJY z2_mFXUHk*;T!%1|G_EG2l-P0qX}cqltKf>J$DO<$Mfyl?SGgC;;I!@czUA(X`D&WV zws$Grvn$KaY5N%&q>)9H*V&0e;WcS6dB^cgAAK(v;xDmVL?ApKDETh4Ou=1yd;>w$~fj}p?$v{KtQMUBvwo|UZ3qs)?uVw730CeU`SXbU|} zVL~Nc#RQ~UoK9ga#Bk2g0gF{Q-+Yf6vTGCiye=A;-(1^kMs|}^Q?6pqwf?Gn9<)Um z#$IjDVdsj~$Cg*Bbgob?>k4V9Ld#`ZSS~FVb*+S}lT;|3rzeV~MNItyF$9>>{eXud z;e7(jx8P2OIRZ%)wMcPiFkBKLZ8ne|*&d||6)ILzbfQoxR?=+6bG6feKO2bvUTW)P z`|w`3QG}r{9sNGL8E`yP66vQ-AQU>4rcgFi#}2rjs)<@k`8mt#bUGOu1!9Y5WYDy9 zg$LzA>r`=JsZ?2dIGe%t*z+=?w!KcWrfhh$sV;q7HTs0L6 { static void Eval(const Eigen::DefaultDevice& dev, OutType out, const InType& in) { - out.device(dev) = in.erf(); + //out.device(dev) = in.erf(); } }; @@ -42,8 +42,8 @@ struct EigenErfGrad { OutType din, const InType& in, const InType& dout) { - din.device(dev) = - dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); + //din.device(dev) = + // dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); } }; diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h index c724564417b19..4189faea8faa9 100644 --- a/paddle/phi/kernels/funcs/eigen/extensions.h +++ b/paddle/phi/kernels/funcs/eigen/extensions.h @@ -131,7 +131,7 @@ struct NumTraits : GenericNumTraits { return phi::dtype::raw_uint16_to_float16(0x7c01); } }; - +#if 0 namespace numext { //////////// bfloat methods ///////////// @@ -435,6 +435,7 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) { } } // namespace numext +#endif } // namespace Eigen #endif // __xpu__ diff --git a/paddle/phi/kernels/funcs/eigen/pad.cu b/paddle/phi/kernels/funcs/eigen/pad.cu index c4a3dd9ecc4f5..42ac4e51de261 100644 --- a/paddle/phi/kernels/funcs/eigen/pad.cu +++ b/paddle/phi/kernels/funcs/eigen/pad.cu @@ -39,7 +39,7 @@ struct EigenPad { const InType& in, const Array& padding, const T value) { - out.device(dev) = in.pad(padding, value); + //out.device(dev) = in.pad(padding, value); } static void Eval32(const Eigen::GpuDevice& dev, @@ -47,7 +47,7 @@ struct EigenPad { const InType32BitIndex& in, const Array32Bit& 
padding, const T value) { - out.device(dev) = in.pad(padding, value); + //out.device(dev) = in.pad(padding, value); } }; diff --git a/paddle/phi/kernels/funcs/eigen/slice.cu b/paddle/phi/kernels/funcs/eigen/slice.cu index ade58d0698759..64d7e243bc38d 100644 --- a/paddle/phi/kernels/funcs/eigen/slice.cu +++ b/paddle/phi/kernels/funcs/eigen/slice.cu @@ -39,7 +39,7 @@ struct EigenSlice { const InType& in, const Array& offsets, const Array& extents) { - out.device(dev) = in.slice(offsets, extents); + //out.device(dev) = in.slice(offsets, extents); } static void Eval(const Eigen::GpuDevice& dev, @@ -47,7 +47,7 @@ struct EigenSlice { const InType32BitIndex& in, const Array32Bit& offsets, const Array32Bit& extents) { - out.device(dev) = in.slice(offsets, extents); + //out.device(dev) = in.slice(offsets, extents); } }; diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 274ac1cc32c05..683696f810c80 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -151,7 +151,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -486,7 +486,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) // static unroller template
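// [Editor's note: illustrative sketch, not part of the patch.] Most hunks in
// this part of the patch repeat a single pattern: every guard that used to test
// only the NVIDIA and AMD device compilers now also tests __MUSACC__, so the
// same headers compile under the MUSA toolchain. Reduced to its essence
// (HOSTDEVICE and Lerp here are standalone stand-ins, not the project's
// definitions):
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif

HOSTDEVICE inline float Lerp(float a, float b, float t) {
  // builds for the host everywhere, and for the device under any of the three compilers
  return a + t * (b - a);
}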