From 2c611c7c491fd82512e3912ef98e7c3b7f32a245 Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Mon, 24 Jul 2023 12:12:15 +0000 Subject: [PATCH 01/55] [MTAI] build(system): enable build system in paddle for MUSA --- CMakeLists.txt | 31 ++ cmake/flags.cmake | 4 - cmake/generic.cmake | 68 ++++ cmake/mccl.cmake | 31 ++ cmake/mudnn.cmake | 66 ++++ cmake/musa.cmake | 33 ++ .../distributed/fleet_executor/carrier.cc | 2 +- .../fleet_executor/cond_interceptor.cc | 2 +- .../distributed/fleet_executor/dist_model.cc | 2 +- .../eager_generated/backwards/scale_node.cc | 2 +- .../generator/python_c_gen.py | 2 +- paddle/fluid/eager/nan_inf_utils.cc | 2 +- paddle/fluid/framework/conv_search_cache.h | 8 +- .../fluid/framework/copy_same_tensor_test.cc | 2 +- paddle/fluid/framework/custom_operator.cc | 4 +- paddle/fluid/framework/data_feed.cc | 2 +- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_feed_factory.cc | 2 +- .../fluid/framework/details/build_strategy.cc | 4 +- .../details/eager_deletion_op_handle.cc | 27 +- .../details/eager_deletion_op_handle.h | 2 +- .../details/fetch_async_op_handle.cc | 2 +- .../framework/details/fetch_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 27 +- .../details/gather_op_handle_test.cc | 4 +- .../framework/details/nan_inf_utils_detail.cc | 2 +- .../fluid/framework/details/op_handle_base.cc | 46 ++- .../fluid/framework/details/op_handle_base.h | 2 +- .../details/reduce_op_handle_test.cc | 2 +- .../details/scale_loss_grad_op_handle.cc | 4 +- .../details/share_tensor_buffer_op_handle.cc | 2 +- paddle/fluid/framework/details/var_handle.h | 4 +- paddle/fluid/framework/device_worker.h | 14 +- paddle/fluid/framework/dlpack_tensor.cc | 4 +- paddle/fluid/framework/dlpack_tensor_test.cc | 2 +- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/fleet/box_wrapper.cu | 7 +- paddle/fluid/framework/fleet/box_wrapper.h | 5 +- .../fluid/framework/fleet/box_wrapper_impl.h | 15 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 7 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 2 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 6 +- paddle/fluid/framework/fleet/heter_wrapper.h | 2 +- paddle/fluid/framework/garbage_collector.cc | 8 +- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/framework/ir/cost_model.cc | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 2 +- .../framework/ir/fuse_bn_add_act_pass.cc | 2 +- .../ir/fusion_group/code_generator_tester.cc | 2 +- ...est_reference_count_pass_last_lived_ops.cc | 2 +- .../interpreter/execution_config.cc | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../new_executor/interpreter_base_impl.h | 2 +- .../new_executor/new_ir_interpreter.cc | 4 +- .../fluid/framework/new_executor/profiler.h | 2 +- .../new_executor/program_interpreter.cc | 4 +- paddle/fluid/framework/op_registry.h | 2 +- paddle/fluid/framework/operator.cc | 12 +- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 26 +- paddle/fluid/framework/phi_utils.cc | 2 +- paddle/fluid/framework/phi_utils.h | 2 +- paddle/fluid/framework/pull_dense_worker.cc | 14 +- paddle/fluid/framework/section_worker.cc | 2 +- paddle/fluid/framework/tensor_test.cc | 10 +- paddle/fluid/framework/tensor_util.cc | 14 +- paddle/fluid/framework/tensor_util.h | 8 +- paddle/fluid/framework/tensor_util_test.cc | 10 +- paddle/fluid/framework/trainer.h | 6 +- paddle/fluid/framework/var_type_traits.h | 4 +- paddle/fluid/imperative/amp_auto_cast.cc | 2 +- .../fluid/imperative/gradient_accumulator.cc | 18 +- 
paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/fluid/imperative/tracer.cc | 6 +- .../ir_params_sync_among_devices_pass.cc | 4 +- .../ir_params_sync_among_devices_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 10 +- .../fluid/inference/api/analysis_predictor.cc | 28 +- .../fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- .../inference/api/details/zero_copy_tensor.cc | 14 +- paddle/fluid/inference/api/infer_context.cc | 2 +- paddle/fluid/inference/api/infer_context.h | 2 +- .../fluid/inference/api/resource_manager.cc | 15 +- paddle/fluid/inference/api/resource_manager.h | 6 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 5 +- paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- .../memory/allocation/allocator_facade.cc | 28 +- .../memory/allocation/allocator_facade.h | 2 +- .../allocator_facade_abs_flags_test.cc | 6 +- .../allocator_facade_frac_flags_test.cc | 6 +- ...o_growth_best_fit_allocator_facade_test.cc | 6 +- .../memory/allocation/buddy_allocator.cc | 6 +- .../memory/allocation/buddy_allocator_test.cc | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 6 + .../cuda_device_context_allocator.h | 15 +- .../allocation/cuda_managed_allocator.cc | 5 + .../allocation/naive_best_fit_allocator.cc | 28 +- .../naive_best_fit_allocator_test.cc | 2 +- .../memory/allocation/pinned_allocator.cc | 8 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../memory/allocation/system_allocator.cc | 8 +- .../memory/allocation/system_allocator.h | 2 +- .../allocation/system_allocator_test.cc | 2 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- paddle/fluid/memory/memcpy.cc | 78 ++++- paddle/fluid/memory/memory_stats_test.cc | 2 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- .../fluid/operators/class_center_sample_op.cu | 14 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../operators/collective/c_wait_comm_op.cc | 5 +- .../operators/collective/c_wait_compute_op.cc | 5 +- .../controlflow/conditional_block_op.h | 2 +- paddle/fluid/operators/controlflow/feed_op.cc | 2 +- .../operators/controlflow/get_places_op.cc | 4 +- .../operators/controlflow/while_op_helper.cc | 2 +- .../operators/detection/target_assign_op.h | 8 +- paddle/fluid/operators/dgc_op.h | 2 +- paddle/fluid/operators/expand_as_op.cc | 2 +- paddle/fluid/operators/expand_op.cc | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 4 +- .../fused_embedding_eltwise_layernorm_op.cu | 17 +- .../fused_softmax_mask_upper_triangle_op.cu | 4 + .../get_tensor_from_selected_rows_op.cc | 2 +- .../fluid/operators/graph_khop_sampler_op.cu | 17 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 5 +- .../operators/margin_cross_entropy_op.cu | 4 +- .../operators/math/bert_encoder_functor.h | 2 +- paddle/fluid/operators/math/prelu.h | 2 +- paddle/fluid/operators/math/sample_prob.h | 2 +- paddle/fluid/operators/matmul_op.cc | 2 +- paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 2 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/nop_op.cc | 2 +- .../fluid/operators/pad_constant_like_op.cc | 2 +- .../operators/pscore/send_and_recv_op.cc | 2 +- 
paddle/fluid/operators/random_crop_op.h | 4 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 4 +- .../fluid/operators/reader/buffered_reader.h | 4 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/select_op_helper.h | 2 +- .../sequence_ops/sequence_reverse_op.h | 4 +- .../sequence_ops/sequence_softmax_op.cc | 2 +- paddle/fluid/operators/shuffle_batch_op.cu | 2 +- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- paddle/fluid/operators/sync_batch_norm_op.cu | 50 ++- paddle/fluid/platform/complex_test.cu | 2 +- paddle/fluid/platform/device/device_wrapper.h | 2 +- paddle/fluid/platform/device/gpu/gpu_dnn.h | 2 +- paddle/fluid/platform/device/gpu/gpu_helper.h | 2 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 7 + paddle/fluid/platform/device/gpu/gpu_info.h | 2 +- .../platform/device/gpu/gpu_launch_config.h | 2 +- .../platform/device/gpu/gpu_resource_pool.cc | 2 +- .../platform/device/gpu/gpu_resource_pool.h | 2 +- paddle/fluid/platform/device/gpu/gpu_types.h | 12 +- paddle/fluid/platform/device_code_test.cc | 2 +- paddle/fluid/platform/device_context.cc | 10 +- paddle/fluid/platform/device_context.h | 4 +- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/platform/enforce_test.cc | 2 +- paddle/fluid/platform/init.cc | 10 +- paddle/fluid/platform/init_test.cc | 2 +- paddle/fluid/platform/place.h | 4 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.h | 4 +- .../platform/profiler/chrometracing_logger.cc | 2 +- .../platform/profiler/chrometracing_logger.h | 2 +- .../profiler/dump/deserialization_reader.cc | 4 +- .../profiler/dump/deserialization_reader.h | 2 +- .../profiler/dump/serialization_logger.cc | 2 +- .../profiler/dump/serialization_logger.h | 2 +- .../fluid/platform/profiler/event_python.cc | 6 +- paddle/fluid/platform/profiler/event_python.h | 6 +- paddle/fluid/platform/profiler/profiler.cc | 4 +- paddle/fluid/platform/profiler_helper.h | 4 +- paddle/fluid/platform/profiler_test.cc | 2 +- paddle/fluid/pybind/cuda_streams_py.cc | 20 +- paddle/fluid/pybind/cuda_streams_py.h | 4 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 4 +- paddle/fluid/pybind/generator_py.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 12 +- paddle/fluid/pybind/parallel_executor.cc | 2 +- paddle/fluid/pybind/place.cc | 16 +- paddle/fluid/pybind/process_group_utils.h | 4 +- paddle/fluid/pybind/pybind.cc | 12 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 12 +- paddle/phi/CMakeLists.txt | 3 + paddle/phi/api/include/context_pool.h | 2 +- paddle/phi/api/include/tensor.h | 2 +- paddle/phi/api/lib/context_pool.cc | 4 +- paddle/phi/api/lib/data_transform.cc | 6 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/api/lib/tensor_utils.cc | 4 +- paddle/phi/api/profiler/event.h | 8 +- paddle/phi/backends/CMakeLists.txt | 6 +- paddle/phi/backends/context_pool.cc | 2 +- paddle/phi/backends/context_pool.h | 4 +- paddle/phi/backends/device_code.cc | 6 +- paddle/phi/backends/device_code.h | 2 +- paddle/phi/backends/device_memory_aligment.h | 2 +- paddle/phi/backends/gpu/gpu_context.cc | 2 +- paddle/phi/backends/gpu/gpu_context.h | 6 +- paddle/phi/backends/gpu/gpu_device_function.h | 2 +- paddle/phi/backends/gpu/gpu_dnn.h | 2 +- paddle/phi/backends/gpu/gpu_helper.h | 2 +- 
paddle/phi/backends/gpu/gpu_info.h | 2 +- paddle/phi/backends/gpu/gpu_launch_config.h | 2 +- paddle/phi/backends/gpu/gpu_types.h | 4 +- paddle/phi/backends/gpu/musa/musa_info.cc | 329 ++++++++++++++++++ paddle/phi/capi/lib/c_device_context.cc | 2 +- paddle/phi/capi/lib/c_kernel_context.cc | 2 +- paddle/phi/common/backend.h | 2 +- paddle/phi/common/complex.h | 4 +- paddle/phi/common/float16.h | 2 +- paddle/phi/common/memory_utils.cc | 2 +- paddle/phi/common/memory_utils.h | 6 +- paddle/phi/common/place.cc | 4 +- paddle/phi/core/compat/convert_utils.cc | 6 +- paddle/phi/core/cuda_stream.h | 18 + paddle/phi/core/enforce.h | 2 +- paddle/phi/core/flags.cc | 16 +- paddle/phi/core/generator.cc | 4 +- paddle/phi/core/kernel_factory.cc | 4 +- paddle/phi/core/kernel_registry.h | 4 +- paddle/phi/core/kernel_utils.h | 2 +- paddle/phi/core/mixed_vector.cc | 4 +- paddle/phi/core/string_tensor.cc | 2 +- paddle/phi/core/tensor_utils.cc | 16 +- paddle/phi/core/utils/type_info.cc | 4 +- paddle/phi/core/utils/visit_place.h | 4 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/CMakeLists.txt | 2 +- paddle/phi/kernels/activation_kernel.cc | 2 +- paddle/phi/kernels/assign_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/CMakeLists.txt | 2 +- paddle/phi/kernels/funcs/blas/blas.h | 4 +- .../phi/kernels/funcs/detail/strided_memcpy.h | 6 +- paddle/phi/kernels/funcs/layer_norm_util.h | 4 +- paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/math_function.h | 2 +- paddle/phi/kernels/funcs/pooling.h | 6 +- paddle/phi/kernels/funcs/select_impl.cu.h | 2 +- paddle/phi/kernels/funcs/softmax.h | 2 +- paddle/phi/kernels/funcs/strided_memcpy.h | 2 +- .../fusion/gpu/fused_softmax_mask_utils.h | 2 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/reduce.h | 2 +- paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/group_norm_kernel.h | 2 +- .../kernels/impl/segment_pool_kernel_impl.h | 2 +- paddle/phi/kernels/impl/warpctc_kernel_impl.h | 2 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 4 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/kps/elementwise_kernel.cu | 2 +- paddle/phi/kernels/layer_norm_kernel.h | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 4 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- paddle/phi/kernels/prod_kernel.cc | 2 +- paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- .../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 4 +- .../kernels/selected_rows/isfinite_kernel.cc | 4 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- .../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/shape_kernel.cc | 2 +- paddle/phi/kernels/sparse/empty_kernel.cc | 2 +- .../sparse/sparse_utils_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_grad_kernel.cc | 
2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 4 +- .../kernels/strings/strings_empty_kernel.cc | 2 +- paddle/phi/kernels/strings/unicode.cc | 2 +- paddle/phi/kernels/strings/unicode.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 4 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- paddle/testing/paddle_gtest_main.cc | 4 +- 312 files changed, 1459 insertions(+), 614 deletions(-) create mode 100644 cmake/mccl.cmake create mode 100644 cmake/mudnn.cmake create mode 100644 cmake/musa.cmake create mode 100644 paddle/phi/backends/gpu/musa/musa_info.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 632cf33100c7e..2f05a7eb080fa 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF) option(WITH_XPU_PLUGIN "Compile PaddlePaddle with BAIDU XPU plugin" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_MUSA "Compile PaddlePaddle with MUSA platform" OFF) option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) @@ -89,6 +90,9 @@ endif() if(WITH_GPU AND WITH_ROCM) message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() +if(WITH_GPU AND WITH_MUSA) + message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -346,6 +350,7 @@ if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT WITH_GPU AND NOT WITH_ROCM + AND NOT WITH_MUSA AND NOT WITH_XPU AND NOT WITH_XPU_KP AND NOT WITH_XPU_XFT @@ -503,6 +508,31 @@ else() endif() endif() +if(WITH_MUSA) + include(musa) + include(mudnn) +endif() + +if(NOT WITH_MUSA AND WITH_MCCL) + message( + WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.") + set(WITH_MCCL + OFF + CACHE STRING "Disable MCCL when compiling without MUSA" FORCE) +endif() + +if(WITH_MCCL) + add_definitions("-DPADDLE_WITH_MCCL") + include(mccl) +else() + if(WITH_MUSA) + message( + WARNING + "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used." 
+ ) + endif() +endif() + if(WITH_HETERPS AND WITH_PSLIB) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -733,6 +763,7 @@ if(WITH_CPP_DIST) endif() endif() +include_directories(/usr/lib/llvm-11/include/openmp/) add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index a32dea08e5bff..3e95ed25ce473 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -141,15 +141,11 @@ if(NOT WIN32) set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer - -Werror - -Wall - -Wextra -Wno-unused-parameter -Wno-unused-function -Wno-error=array-bounds #Warning in Eigen, gcc 12.2 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 - -Wimplicit-fallthrough=0 # Warning in tinyformat.h ${fsanitize}) if(WITH_IPU) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 947d44950d52b..28aecb580a637 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -425,6 +425,9 @@ function(cc_binary TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + #if(WITH_MUSA) + # target_link_libraries(${TARGET_NAME} ${MUSA_LIB}) + #endif() check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) @@ -775,6 +778,71 @@ function(hip_test TARGET_NAME) endif() endfunction() +function(musa_library TARGET_NAME) + if(WITH_MUSA) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(musa_library_SRCS) + # TODO(MTAI): enable compiling static library + #if(musa_library_SHARED OR musa_library_shared) # build *.so + # musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + #else() + # musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) + # find_fluid_modules(${TARGET_NAME}) + # find_phi_modules(${TARGET_NAME}) + #endif() + musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + if(musa_library_DEPS) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${musa_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND musa_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else() + if(musa_library_DEPS) + list(REMOVE_DUPLICATES musa_library_DEPS) + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:musa_library") + + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + else() + message(FATAL "Please specify source file or library in musa_library.") + endif() + endif() + endif() +endfunction() + +function(musa_binary TARGET_NAME) + if(WITH_MUSA) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + musa_add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + if(musa_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction() + +# TODO(MTAI): enable musa_test +#function(musa_test TARGET_NAME) +#endfunction() + function(xpu_library TARGET_NAME) if(WITH_XPU_KP) set(options STATIC static SHARED
shared) diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake new file mode 100644 index 0000000000000..12191a2711d46 --- /dev/null +++ b/cmake/mccl.cmake @@ -0,0 +1,31 @@ +if(NOT WITH_MUSA) + return() +endif() + +# Now we don't support MCCL on windows +if(WIN32) + return() +endif() + +# FIXME(MTAI): please make sure that we can find MCCL successfully +if(WITH_MCCL) + set(MCCL_ROOT + ${MUSA_PATH}/mccl + CACHE PATH "MCCL ROOT") + find_path( + MCCL_INCLUDE_DIR mccl.h + PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include + $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include + NO_DEFAULT_PATH) + + file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) + + string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" MCCL_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" MCCL_VERSION + "${MCCL_VERSION}") + + message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. " + "Current MCCL version is v${MCCL_VERSION}. ") +endif() + diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake new file mode 100644 index 0000000000000..80c74c9131c21 --- /dev/null +++ b/cmake/mudnn.cmake @@ -0,0 +1,66 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(WIN32) + return() +endif() + +find_path( + MUDNN_INCLUDE_DIR mudnn.h + PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) + +get_filename_component(__libpath_hist ${MUSA_MUSART_LIBRARY} PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list( + APPEND + MUDNN_CHECK_LIBRARY_DIRS + ${MUDNN_ROOT} + ${MUDNN_ROOT}/lib64 + ${MUDNN_ROOT}/lib + ${MUDNN_ROOT}/lib/x64 + ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${MUDNN_ROOT}/local/cuda-${MUSA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/lib64 + $ENV{MUDNN_ROOT}/lib + $ENV{MUDNN_ROOT}/lib/x64 + /usr/lib + ${MUSA_TOOLKIT_ROOT_DIR} + ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) +set(MUDNN_LIB_NAME "") + +if(LINUX) + set(MUDNN_LIB_NAME "libmudnn.so") +endif() + +find_library( + MUDNN_LIBRARY + NAMES ${MUDNN_LIB_NAME} + PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to muDNN library.") + +if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) + set(MUDNN_FOUND ON) +else() + set(MUDNN_FOUND OFF) +endif() + +#macro(find_cudnn_version cudnn_header_file) +#endmacro() + +#if(MUDNN_FOUND) +# find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn.h) +# if(NOT MUDNN_MAJOR_VERSION) +# find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) +# endif() +#endif() + diff --git a/cmake/musa.cmake b/cmake/musa.cmake new file mode 100644 index 0000000000000..39245d726d4f9 --- /dev/null +++ b/cmake/musa.cmake @@ -0,0 +1,33 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(NOT DEFINED ENV{MUSA_PATH}) + set(MUSA_PATH + "/usr/local/musa" + CACHE PATH "Path to which ROCm has been installed") +else() + set(MUSA_PATH + $ENV{MUSA_PATH} + CACHE PATH "Path to which ROCm has been installed") +endif() +set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +find_package(MUSA REQUIRED) +include_directories(${MUSA_PATH}/include) + +#macro(find_musa_version version_file) +#endmacro() +#find_musa_version(${MUSA_PATH}/version.h) + +if(WITH_CINN) + list(APPEND MUSA_MCC_FLAGS -std=c++14) +else() + list(APPEND MUSA_MCC_FLAGS -std=c++17) +endif() + +set(MUSA_VERBOSE_BUILD ON) +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND MUSA_MCC_FLAGS -g2) + 
list(APPEND MUSA_MCC_FLAGS -O0) +endif() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 82d99a3835230..dc92bb8f699d6 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -272,7 +272,7 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 2e3389af5feb5..df284822390d0 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,7 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 4836d656d180f..4328941d60a65 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 7567236c4ff68..7050947466d23 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -132,7 +132,7 @@ void ScaleAPI(const paddle::Tensor& x, bias_after_scale, dense_out.get()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { auto* dev_ctx = dynamic_cast(pool.Get(expected_kernel_place)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 7fe53febc5a9b..b96b997976be4 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -124,7 +124,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if 
(paddle::platform::is_gpu_place(place)) {{ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index a66bc211d513c..e3e5968426462 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -98,7 +98,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 1620c99ce8560..cbac8cac4e543 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -32,7 +32,7 @@ class ConvSearchCache { static ConvSearchCache instance; return instance; } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) AlgorithmsCache* GetForward() { return &forward_cache_; } @@ -45,6 +45,8 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } +#elif defined(PADDLE_WITH_MUSA) + #else AlgorithmsCache* GetForward() { return &forward_cache_; @@ -67,11 +69,13 @@ class ConvSearchCache { ConvSearchCache(const ConvSearchCache&) {} ConvSearchCache& operator=(const ConvSearchCache&) {} -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; +#elif defined(PADDLE_WITH_MUSA) + #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 10e0b76f00459..3bc05d706ade9 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -32,7 +32,7 @@ namespace framework { static std::vector CreatePlaceList() { std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(0)); #endif return places; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index ebfed9a6f73f6..64f6214fca0c9 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -123,7 +123,7 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (custom_in.is_gpu_pinned()) { VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -1174,7 +1174,7 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, 
proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 32c4845bd0d57..d99e7739e8e39 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1526,7 +1526,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 1057640842c2c..875b8ca13da83 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -1951,7 +1951,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index e058b19469000..368807f72dfc4 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,7 @@ REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 69f7a49ce55fd..dc66ca6922e35 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -545,7 +545,7 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4012263f688cb..3e204548fa151 100644 --- 
a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,15 +44,18 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -75,12 +78,14 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -89,7 +94,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -177,16 +182,20 @@ void EagerDeletionOpHandle::RunImpl() { void EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -197,7 +206,7 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif }
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 0a92269c50ad2..049b0c2ec478b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,7 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 9fd6a08e02302..d96ab68ec823c 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,7 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index a36b63da9b8b6..9ea280a8d8bc5 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,7 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 29d5697b23f0d..b07211a6b18d7 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,7 @@ typedef std::vector< std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,11 +61,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -100,9 +102,12 @@ void FusedAllReduceOpHandle::RunImpl() { "when
using GPU device.")); auto create_event = [](gpuEvent_t *event) { if (*event) return; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -122,10 +127,14 @@ void FusedAllReduceOpHandle::RunImpl() { auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); nccl_stream = nccl_ctx.stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -185,12 +194,16 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) if (FLAGS_allreduce_record_one_event) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3437eb5570dc7..455879f02e833 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -47,7 +47,7 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -224,7 +224,7 @@ TEST(GatherTester, TestCPUGatherTestSelectedRows) { test_op.TestGatherSelectedRows(input_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(GatherTester, TestGPUGatherTestSelectedRows) { TestGatherOpHandle test_op; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 80c029a5fd976..bc8a31a35a95c 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" << tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 82f09f51c23e1..69fb0df678f65 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,11 +31,13 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (auto &ev : events_) { if (ev.second) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -45,13 +47,16 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; platform::SetDeviceId(dev_id); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -136,7 +141,7 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) { InitCUDA(); } @@ -172,7 +177,7 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -186,8 +191,10 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { } else { auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif @@ -221,12 +228,15 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast(dev_ctxes_.at(place))->stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -248,7 +258,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -273,13 +283,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -311,15 +324,18 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -331,7 +347,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9afe56e4babd4..4bd385ff5099c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,7 @@ class OpHandleBase { // See https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::unordered_map events_; #endif diff --git
a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 7587fb6553cd7..205567a39ecd7 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -303,7 +303,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) { test_op.InitReduceOp(out_scope_idx); test_op.TestReduceLodTensors(out_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(ReduceTester, TestGPUReduceTestSelectedRows) { TestReduceOpHandle test_op; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 9dac1a7203f8d..8b487b5a0bffb 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,7 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +110,7 @@ void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); tensor->Resize(phi::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 02a68fb697efb..cb16915316ecf 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index a6314220d5c26..9a130bea0d3a2 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,7 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +154,7 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 25d29e469a498..1da0aae399c37 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -84,11 +84,11 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); @@ -154,7 +154,7 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector copy_streams_; #endif std::vector places_; @@ -185,7 +185,7 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -561,7 +561,7 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -629,7 +629,7 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -802,7 +802,7 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 46b917cda740a..e5e8bae0bbd79 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; @@ -108,7 +108,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index f6b28b0a22ebc..7bf07aac14127 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -108,7 +108,7 @@ void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { template void TestMainLoop() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e0ad2255743c4..40606c4911649 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -492,7 +492,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, std::unique_ptr gc; if (!ctx->force_disable_gc_ && max_memory_size >= 0) { if (platform::is_gpu_place(place_)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } else { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 5f46906cf8e82..389b1f99eed72 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -156,11 +156,16 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 9853c328cd14e..054298795305e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -593,8 +593,11 @@ class BoxWrapper { auto* gpu_data = gpu_tensor.data(); auto len = gpu_tensor.numel(); data->resize(len); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index d72e418aadd3e..09696c824fbbd 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) ||
defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = keys_tensor[device_id]; @@ -61,7 +61,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), @@ -70,6 +70,15 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + musaMemcpyHostToDevice); + musaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -153,7 +162,7 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 05433c1014656..f2c6892c6cd11 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,7 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -813,9 +813,12 @@ void FleetWrapper::PushDenseVarsAsync( g_data, sizeof(float) * count, stream); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); + musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index fb5cf91729256..1284b379c9f20 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,7 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 
2cae0721aefa9..761ef1cf8051a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +141,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, @@ -169,7 +169,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 77838fbec6d00..70cbce2acc24d 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,7 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 3296679e1eeeb..1d3937ba2b982 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "gflags/gflags.h" @@ -64,7 +64,7 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -91,8 +91,10 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f3d9ec54e6968..9727654d04c84 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,7 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 9ca3190fd092f..9ac931f2501a7 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +152,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 299e700edb95d..322fcb0f7cf48 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,7 +34,7 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; 
diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 506e8721298b6..a218e768ac41d 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,7 +25,7 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 06593733e6a27..7ffd09d2474df 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,7 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index a0f1d9eed0038..b986fc5b37adb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -203,7 +203,7 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 1e6a6f02e2230..aa769089d7fed 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,7 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = std::thread::hardware_concurrency(); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 13896b66f3c55..e70d6fabd5c05 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -641,7 +641,7 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, 
expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 1ae7e5e59ce1f..66a41274cd105 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,7 @@ DECLARE_bool(benchmark); DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_new_ir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 3b40a3b0727f1..eae90f2a29739 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -892,7 +892,7 @@ void NewIRInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1245,7 +1245,7 @@ void NewIRInterpreter::RecordStreamForGC(const Instruction& instr) { void NewIRInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 95eee77d36288..f2fa9fd50eedb 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,7 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index b6c54192a6970..04cbca42c152a 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -880,7 +880,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1232,7 +1232,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void 
ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 8cb29a0d5df4c..db535b4fa58de 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -358,7 +358,7 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1544,7 +1544,7 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= (dev_ctx.cudnn_handle() != nullptr); @@ -1783,7 +1783,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2109,7 +2109,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2132,7 +2132,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e6a2058107b1d..68df442f4a5fa 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -575,7 +575,7 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 806b8570108b9..ccf4534bddbb2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,14 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +69,7 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::once_flag p2p_init_flag; #endif @@ -512,7 +512,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size)); } else { @@ -621,7 +621,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -638,10 +638,14 @@ void InitP2P(const std::vector &places) { for (int j = 0; j < count; ++j) { if (devices[i] == devices[j]) continue; int can_acess = -1; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { +#elif defined(PADDLE_WITH_MUSA) + musaError_t ret = + musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -651,8 +655,10 @@ void InitP2P(const std::vector &places) { << " to " << devices[j]; } else { platform::CUDADeviceGuard guard(devices[i]); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipDeviceEnablePeerAccess(devices[j], 0); +#elif defined(PADDLE_WITH_MUSA) + musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -1299,7 +1305,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1308,7 +1314,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( @@ -1674,7 +1680,7 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if 
(member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 9881d479a75a2..4d1bb616c33e2 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,7 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index f8589e95ff8e9..d5262264aa0cd 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 7b61052a20151..5cb310fd9a4a1 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,10 +69,10 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) copy_streams_.clear(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); @@ -80,7 +80,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { } void PullDenseWorker::CreatePinVar() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { @@ -96,7 +96,7 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -125,7 +125,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { @@ -144,7 +144,7 @@ void 
PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -179,7 +179,7 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 58e879a5011c2..cd436becfbe93 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -228,7 +228,7 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 5ef6f53d38d50..9b1e8ccf63e87 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -114,7 +114,7 @@ TEST(DenseTensor, MutableData) { EXPECT_EQ(static_cast(p2[0]), 1); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; float* p1 = nullptr; @@ -168,7 +168,7 @@ TEST(DenseTensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor dst_tensor; @@ -206,7 +206,7 @@ TEST(DenseTensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 9}), @@ -295,7 +295,7 @@ TEST(DenseTensor, Split) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), @@ -357,7 +357,7 @@ TEST(DenseTensor, Chunk) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 
d8224cb0dd72b..50f23057c61b1 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -124,7 +124,7 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -377,7 +377,7 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -479,7 +479,7 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -613,7 +613,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); @@ -686,7 +686,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); @@ -809,7 +809,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -849,7 +849,7 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h 
b/paddle/fluid/framework/tensor_util.h index 36a3e968251c9..77ab6f4918caf 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,7 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +175,7 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +304,7 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +346,7 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index bda2681f57f31..89c4a764b86f2 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) { } EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor gpu_tensor; @@ -153,7 +153,7 @@ TEST(TensorFromVector, Tensor) { delete cpu_place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -232,7 +232,7 @@ TEST(TensorToVector, Tensor) { EXPECT_EQ(src_ptr[i], dst[i]); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor gpu_tensor; @@ -323,7 +323,7 @@ TEST(TensorFromDLPack, Tensor) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -489,7 +489,7 @@ TEST(Tensor, FromAndToStream) { EXPECT_EQ(dst_tensor.dims(), src_tensor.dims()); delete place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { phi::DenseTensor gpu_tensor; 
gpu_tensor.Resize({2, 3}); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index bf69bed9d4851..9a0d9880f5d04 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -172,7 +172,7 @@ class HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuEvent_t event_; #endif std::vector ops_; @@ -204,7 +204,7 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -242,7 +242,7 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::vector copy_streams_; std::vector events_; #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 2e188e6caa076..286ee379d82dd 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -59,7 +59,7 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class Communicator; class NCCLCommunicator; @@ -189,7 +189,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ncclUniqueId, platform::Communicator, diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index be715a2a451ad..1d424e81ba5ef 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -138,7 +138,7 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 14b9bc5aae0bc..8c78f7af783dd 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -204,7 +204,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_TENSOR_ADD(float, phi::GPUContext); 
PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -313,7 +313,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -321,7 +321,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -364,7 +364,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -372,7 +372,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -425,7 +425,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -441,7 +441,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif @@ -712,7 +712,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -778,7 +778,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index cda2fad5d7436..6401580096db8 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,7 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +555,7 @@ static void 
PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +645,7 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index ccb58d320221c..f7b67e027fb7b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -106,7 +106,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gc.reset(new framework::DefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -116,7 +116,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -274,7 +274,7 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 65e149925e742..2580a2aa8ec2a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -36,7 +36,7 @@ namespace paddle { namespace inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
if (!argument->use_gpu()) return; @@ -209,7 +209,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index ee29af1c13308..6ab7d83b8922d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 25c7e7e2a03d4..ea5ad99ea0be0 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +100,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -630,7 +630,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -928,7 +928,7 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1145,7 +1145,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. 
size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 56652c2f42cb7..12e893d72781f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -99,7 +99,7 @@ namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -270,7 +270,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -370,7 +370,7 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(inference): Now only gpu with external stream support private // device_context. if (config_.use_gpu_ && config_.use_external_stream_) { @@ -418,7 +418,7 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -489,14 +489,14 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Init GPUContext. 
if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -534,7 +534,7 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2151,7 +2151,7 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2160,8 +2160,10 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { } if (stream != predictor_stream_) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipStreamSynchronize(static_cast(predictor_stream_)); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2199,11 +2201,13 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipStreamSynchronize(stream); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2595,7 +2599,7 @@ AnalysisPredictor::~AnalysisPredictor() { if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index bde6ca48741ad..36c5d13a84521 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -220,7 +220,7 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Note: Can only be used under thread_local semantics.
bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 28353150c265c..f69a434f36f83 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 37ee2b4df643d..8791d6dfe0266 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -108,7 +108,7 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipStreamSynchronize(dev_ctx->stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(dev_ctx->stream()); #else // async, return stream if (nullptr != exec_stream) { @@ -821,7 +823,7 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); paddle::memory::Copy(gpu_place, @@ -891,7 +893,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index 533363f1b25da..57a7625aaef58 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -21,7 +21,7 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git 
a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 2b5c4e974eb08..19f285ad78b65 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,7 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 3f06ee5722af9..9f5df0edfa06c 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,7 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -99,9 +99,12 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { if (semaphore_ == NULL) { char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; semaphore_ = reinterpret_cast(scratch); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -132,7 +135,7 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -156,8 +159,10 @@ void GPUContextResource::InitGPUResource(void* stream) { void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -375,7 +380,7 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index e14de1c2ffc86..9686761029374 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -25,7 +25,7 @@ #include "paddle/phi/common/place.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include 
"paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -49,7 +49,7 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -141,7 +141,7 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // GPU Resource public: void* InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 6de5f9cfa0ca1..509b3f0b993f8 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 2500f624967c6..ed2993e7a39e7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -215,9 +215,12 @@ void QkvToContextPluginDynamic::configurePlugin( fake_qk_bias_ = reinterpret_cast( tensor_.mutable_data(platform::CUDAPlace(device_id))); int64_t size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 1a39590398911..aa96228a694d5 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,7 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) list( APPEND ALLOCATOR_SRCS diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 07e55115ba130..41635de256abe 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/macros.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -164,7 +164,7 @@ class AllocatorFacadePrivate { public: using 
AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using CUDAAllocatorMap = std::map>>; @@ -187,7 +187,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -214,7 +214,7 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -286,7 +286,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -345,7 +345,7 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -594,7 +594,7 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void InitNaiveBestFitCUDAPinnedAllocator() { allocators_[platform::CUDAPinnedPlace()] = std::make_shared(platform::CUDAPinnedPlace()); @@ -1038,7 +1038,7 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1064,7 +1064,7 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1112,7 +1112,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1145,7 +1145,7 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1252,7 +1252,7 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1278,7 +1278,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1290,7 +1290,7 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a1f21a5e69359..6f1b495891338 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -76,7 +76,7 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. 
uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index 1e09c43c4f12f..0cf8089f5a65f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +46,7 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +82,7 @@ void AllocateTestCases() { } TEST(Allocator, SpecifyGpuMemory) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and // FLAGS_reallocate_gpu_memory_in_mb FLAGS_fraction_of_gpu_memory_to_use = 0.0; diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 63e3eab3256c9..b60b53bc28f3c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +46,7 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +82,7 @@ void AllocateTestCases() { } TEST(Allocator, Allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index bfd05b6b323fe..b4d4699f1f039 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_int64(gpu_allocator_retry_time); @@ -41,7 +41,7 @@ static inline size_t AlignTo(size_t size, size_t alignment) { } TEST(allocator, allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; @@ -102,7 +102,7 @@ TEST(allocator, allocator) { TEST(multithread_allocate, test_segfault) { FLAGS_allocator_strategy = "auto_growth"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::mutex mtx; std::condition_variable cv; bool flag = false; diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 8de464754cb35..9c1402374b323 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define USE_DEVICE PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -53,7 +53,7 @@ BuddyAllocator::BuddyAllocator( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; #endif @@ -249,7 +249,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize( init_allocate_size_func_, re_allocate_size_func_, request_bytes); #else -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); #endif diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 1aeb1722d0ec8..6b99499824cfb 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -77,7 +77,7 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, return nullptr; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(BuddyAllocator, GpuFraction) { // In a 16 GB machine, the pool size will be about 160 MB FLAGS_fraction_of_gpu_memory_to_use = 0.01; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 781addd7dba60..da5fdc829e8c0 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -19,6 +19,12 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 7286f84160c6a..1401aeb7a11be 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -79,9 +79,12 @@ class GPUContextAllocator : public Allocator { gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreate(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); @@ -91,9 +94,10 @@ class GPUContextAllocator : public Allocator { ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); -#ifdef PADDLE_WITH_HIP - +#if defined(PADDLE_WITH_HIP) PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -110,9 +114,12 @@ class GPUContextAllocator : public Allocator { auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 06e9fbe88827b..d1b68212736ee 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include 
+#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index e436e6c439081..27a6e3857f224 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" @@ -213,7 +213,7 @@ size_t Used(const platform::XPUPlace &place) { } // For CUDA -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUBuddyAllocatorList { private: GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) { @@ -294,7 +294,7 @@ size_t Used(const platform::CUDAPlace &place) { template <> void *Alloc(const platform::CUDAPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -313,8 +313,10 @@ void *Alloc(const platform::CUDAPlace &place, string::HumanReadableSize(Used(place)))); } else { if (FLAGS_init_allocated_mem) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemset(ptr, 0xEF, size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0xEF, size); #else cudaMemset(ptr, 0xEF, size); #endif @@ -331,7 +333,7 @@ template <> void Free(const platform::CUDAPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GetGPUBuddyAllocator(place.device)->Free(p); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -341,7 +343,7 @@ void Free(const platform::CUDAPlace &place, template <> uint64_t Release(const platform::CUDAPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Release(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -349,7 +351,7 @@ uint64_t Release(const platform::CUDAPlace &place) { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; static BuddyAllocator *ba = nullptr; @@ -367,7 +369,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { template <> size_t Used(const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return GetCUDAPinnedBuddyAllocator()->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -378,7 +380,7 @@ size_t Used(const platform::CUDAPinnedPlace &place) { template <> void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
|| defined(PADDLE_WITH_MUSA) VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); void *ptr = buddy_allocator->Alloc(size); @@ -401,7 +403,7 @@ template <> void Free(const platform::CUDAPinnedPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -413,7 +415,7 @@ void Free(const platform::CUDAPinnedPlace &place, template <> uint64_t Release( const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) VLOG(10) << "Release on " << platform::Place(place); return GetCUDAPinnedBuddyAllocator()->Release(); #else @@ -602,7 +604,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const { } size_t Usage::operator()(const platform::CUDAPlace &gpu) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return Used(gpu); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -611,7 +613,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return Used(cuda_pinned); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c96..5ad4a729a6692 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -33,7 +33,7 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) { alloc.Release(platform::CPUPlace()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(NaiveBestFitAllocatorTest, GpuAlloc) { NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; { diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index f1c0178fafc02..33c6ca55880cd 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -21,8 +21,10 @@ namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostFree(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -35,8 +37,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostMalloc(&ptr, size, 
musaHostMallocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index d1872ee00b7b7..ef8692b64cc51 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -19,7 +19,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_allocator.h" #endif @@ -114,7 +114,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) { platform::CUDAPlace p(0); RetryAllocator allocator(std::make_shared(p), retry_ms); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 210be01669775..4234b615c823b 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +120,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -214,8 +214,10 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. 
-#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaHostMalloc(&p, size, musaHostMallocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 67376a3e39a22..b2cce04a04d37 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc index e04d14f0adfde..a296755c12725 100644 --- a/paddle/fluid/memory/allocation/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -57,7 +57,7 @@ TEST(CPUAllocator, LockMem) { TestAllocator(&a, 0); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); TestAllocator(&a, 2048); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 46f9b1189cb68..f86d4f0f256ca 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,7 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index b8f5f0289c4bc..bd67a4eeefcac 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -48,7 +48,7 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 4a56a01e640bf..45b2ec3ca3875 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -256,10 +256,10 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) inline void SyncCUDAStream() { #if !defined(_WIN32) hipStreamSynchronize(0); @@ -271,6 +271,18 @@ inline void SyncCUDAStream() { } #endif } +#elif defined(PADDLE_WITH_MUSA) +inline void SyncCUDAStream() { +#if !defined(_WIN32) + musaStreamSynchronize(0); 
+#else + musaError_t e_sync = musaSuccess; + while (e_sync = musaStreamQuery(0)) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif +} #else inline void SyncCUDAStream() { #if !defined(_WIN32) @@ -307,12 +319,18 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -323,8 +341,10 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -351,12 +371,18 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -367,8 +393,10 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -397,12 +425,18 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -414,8 +448,10 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -496,12 +532,18 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -513,8 +555,10 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", 
platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -538,12 +582,18 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -555,8 +605,10 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -746,7 +798,7 @@ void Copy(phi::Place dst_place, dst_place.GetType() == phi::AllocationType::CPU) { std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc index 081f0d3d78c13..6afc2a852f0d6 100644 --- a/paddle/fluid/memory/memory_stats_test.cc +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -40,7 +40,7 @@ TEST(stat_allocator_test, host_memory_stat_test) { EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(stat_allocator_test, device_memory_stat_test) { std::vector alloc_sizes{ 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index d1dc7d8986bec..fde5de90c56dc 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -55,7 +55,7 @@ struct ArrayToLoDFunctor { if (std::is_same::value) { Apply(static_cast(pool.Get(place))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index f63baadbde526..5327be6909b4f 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -12,13 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include #include #include typedef hiprandState curandState; namespace cub = hipcub; + +#elif defined(PADDLE_WITH_MUSA) +#include +#include +#include #else #include #include @@ -67,11 +72,16 @@ __global__ void RandomSampleClassCenter(const int64_t n, size_t local_seed = (static_cast(seed) + 0x9E3779B9U + (static_cast(id) << 6U) + (static_cast(id) >> 2U)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hiprand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } +#elif defined(PADDLE_WITH_MUSA) + murand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(murand(&localState) % max_val); + } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index e100397924af5..79c32bc907045 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index bacbe014a343c..f3a34f2c7d057 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -57,9 +57,12 @@ class CWaitCommOp : public framework::OperatorBase { platform::NCCLCommContext::Instance().Get(ring_id, place)->comm_event(); // comm_stream-->event-->compute_stream -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 34569b0a4b600..4b9ca005be397 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -58,9 +58,12 @@ class CWaitComputeOp : public framework::OperatorBase { ->compute_event(); // compute_stream-->event-->comm_stream -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git 
a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 0f04a295ed263..d5419d2b13a4e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,7 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index c2deeb4190986..6b85a1d08657b 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -218,7 +218,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 9f67b1d4b6e18..1074c1c30f676 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,7 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +34,7 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 6ae32f33e957a..790f54612ffae 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -227,7 +227,7 @@ bool GetCondData(const phi::DenseTensor &cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 484bd8454bae9..0fd2a6883943b 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -120,7 +120,7 @@ class TargetAssignKernel : public framework::OpKernel { int64_t k = x->dims()[2]; auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) phi::MixVector mixv_x_lod(&x_lod); size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else @@ -137,7 +137,7 @@ class TargetAssignKernel : public framework::OpKernel { k, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) mixv_x_lod.CopyToCPU(); #endif @@ -154,7 +154,7 @@ class TargetAssignKernel : public framework::OpKernel { "TargetAssignOp input(NegIndices) needs 1 level of LoD")); const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::MixVector mixv_neg_lod(&neg_lod); size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else @@ -170,7 +170,7 @@ class TargetAssignKernel : public framework::OpKernel { mismatch_value, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) mixv_neg_lod.CopyToCPU(); #endif } diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 45f34313d1a3d..f1d37e447991c 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -188,7 +188,7 @@ class DGCOpKernel : public framework::OpKernel { int buf_size = paddle::communication::dgc::get_buffer_size(k); paddle::memory::allocation::AllocationPtr tmp_ious_data; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(dev_ctx.GetPlace())) { tmp_ious_data = memory::Alloc( dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 107fe9f6174b6..f0d31269da193 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -155,7 +155,7 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL(expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index fee4b47049301..490c6f9f6dbfc 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index b6dd3ca8f64b2..1bedf6cc54a4e 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -190,8 +190,10 @@ struct FindChannelAbsMaxFunctor { int grid = cout; int max_threads = 1024; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemset(out_abs_max, 0, sizeof(T) * cout); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // 
PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 35574331e17d7..0216564ed80a4 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -44,8 +44,10 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { framework::TransToPhiDataType(framework::proto::VarType::INT64)); framework::DDim in_dim{input_num}; int device_id; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipGetDevice(&device_id); +#elif defined(PADDLE_WITH_MUSA) + musaGetDevice(&device_id); #else cudaGetDevice(&device_id); #endif @@ -65,7 +67,7 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { in1s.push_back(reinterpret_cast(ids[i]->data())); in2s.push_back(reinterpret_cast(embs[i]->data())); } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpyAsync(in_ids_d, in1s.data(), sizeof(int64_t) * input_num, @@ -76,6 +78,17 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { sizeof(int64_t) * input_num, hipMemcpyHostToDevice, device_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync(in_ids_d, + in1s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); + musaMemcpyAsync(in_embs_d, + in2s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); #else cudaMemcpyAsync(in_ids_d, in1s.data(), diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 32e7cffa4984b..35d69faa1a41d 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -31,6 +31,10 @@ limitations under the License. */ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include #include diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ae92b04b7df4..c6a8a4fe7b982 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index e533960c8a648..7c9d1f3c921f7 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -29,9 +29,12 @@ limitations under the License. 
*/ #include -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -89,12 +92,18 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hiprandState rng; hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murandState rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, + 0, + &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -126,8 +135,10 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, #endif for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index dea3ce3fe695b..ea38db87e63e7 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8c123bb8a32f2..e1e9ca5ef6667 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index aab7953d6d103..940b3eaac0c10 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void 
TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 92f190c0025ed..2c6d72f109c13 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index dd85ccff87f2d..197aaa74bb3e1 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 94b0319729117..da8ea875e9393 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 11c35293ebe34..c627b1cf89dcd 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -218,9 +218,12 @@ struct LookupTableV2GradCUDAFunctor { const auto *ids = ids_t_->template data(); T *d_table = d_table_t->mutable_data(context_.GetPlace()); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), 
dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d741bc5b42549..40231fd4bf2c4 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -13,9 +13,11 @@ // limitations under the License. // old op include, fluid should be removed -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include namespace cub = hipcub; +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 9a0b5a1ae3ab7..a9869e5faecce 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -47,7 +47,7 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // This functor involves a fusion calculation in Ernie or Bert. // The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 00ff1fbcbc38d..1762353abaa9f 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,7 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 7c60be6841552..b7b224a0baaf5 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,7 @@ class SampleWithProb { } }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index e1a36fa41894d..5208d0b2cf937 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -926,7 +926,7 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 5f480461d77cd..a4b6e061bfdff 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,7 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 007f853f3243f..20775d02aadfe 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { if
(platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 8c33a5da1baff..27a38571e1c80 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index 69f0bfb2abcd3..e99b3956d05b0 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -60,6 +60,6 @@ REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} #endif diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index d00cefab45045..72061fbc39630 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -260,7 +260,7 @@ PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(pad_constant_like, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 99e8d04a9e329..49623bb0ec206 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,7 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index fc625826b9a91..de03079b23035 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #endif @@ -37,7 +37,7 @@ struct Random { using UniformIntDist = std::uniform_int_distribution; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct Random { using Engine = thrust::minstd_rand; diff --git a/paddle/fluid/operators/rank_loss_op.cc 
b/paddle/fluid/operators/rank_loss_op.cc index ebdddfd41b33f..b9f05d663dba0 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a0ad7e3939a02..73b3823d3e5ab 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,7 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = @@ -118,7 +118,7 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 032a74b7e23f1..db849dc70b5da 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,7 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 962b18c995979..a089ad7d58fac 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,7 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +462,7 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +492,7 @@ class 
ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -761,7 +761,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index bc1f5a0d34f60..ab03d46486c2e 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -117,7 +117,7 @@ PD_REGISTER_KERNEL(save_sr, phi::dtype::float16, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 2b7f884f6170c..7e3de57345a4b 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,7 +39,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 2236988025cbc..13133e54f0415 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { const size_t *lod; size_t lod_count = x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +144,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 0ca5514900d46..e3af25c4b57f9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 5069cf1e512cb..7c8c6ca475b38 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifndef _MSC_VER #include diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index e648575a1edca..607ea43f50105 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 84e30250f85fd..fda42c80fbbf6 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -302,7 +302,25 @@ void SyncBatchNormCooGradKernel( } // namespace sparse } // namespace phi -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} +#elif defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(sync_batch_norm, GPU, ALL_LAYOUT, @@ -364,7 +382,19 @@ PD_REGISTER_KERNEL(sync_batch_norm, #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#elif defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(sync_batch_norm_grad, GPU, ALL_LAYOUT, @@ -397,13 +427,19 @@ PD_REGISTER_KERNEL(sync_batch_norm_grad, #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(sync_batch_norm_coo, GPU, ALL_LAYOUT, phi::sparse::SyncBatchNormCooKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooKernel, + float, #else 
PD_REGISTER_KERNEL(sync_batch_norm_coo, GPU, @@ -414,13 +450,19 @@ PD_REGISTER_KERNEL(sync_batch_norm_coo, phi::dtype::float16) {} #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, GPU, ALL_LAYOUT, phi::sparse::SyncBatchNormCooGradKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooGradKernel, + float, #else PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, GPU, diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu index b814bcde6841f..effccd3cce75e 100644 --- a/paddle/fluid/platform/complex_test.cu +++ b/paddle/fluid/platform/complex_test.cu @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index aa2dba03c9082..bcfb316837a30 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h index f6f6392c4c23d..2a9db61f83bc6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_dnn.h +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -16,7 +16,7 @@ #include "paddle/phi/backends/gpu/gpu_dnn.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 878a122a49224..7fde4429bb7f3 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 7f1f2c76bd630..94c85105115d6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -216,6 +216,8 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } +#elif defined(PADDLE_WITH_MUSA) + result = musaMalloc(ptr, size); #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -262,6 +264,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { +#elif defined(PADDLE_WITH_MUSA) + auto err = musaFree(ptr); + if (err != musaErrorMusaUnloading) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -309,6 +314,8 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); +#elif defined(PADDLE_WITH_MUSA) + auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index de68329bba66d..64cb1bd8fcab7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index d253a92c986ce..8ce858b4d37a1 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,7 +16,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 9f2168e1cdb8b..ee60040f09074 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License.
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 2ac13e692f783..ff1452153e7bd 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..83497a2507005 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,7 +15,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include @@ -95,6 +95,9 @@ using CUDAGraphID = unsigned long long; // NOLINT #ifdef PADDLE_WITH_HIP #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // CDUA #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ @@ -103,9 +106,10 @@ using CUDAGraphID = unsigned long long; // NOLINT DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 6b58453f03ea8..a4a810b34c3f0 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -45,7 +45,7 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { )"; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) TEST(DeviceCode, cuda) { if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) { return; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 456abd55ef68f..4a81291815373 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,7 @@ limitations under the License.
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +53,7 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +86,7 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -172,7 +172,7 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, @@ -209,7 +209,7 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b07b3f29dafde..2aa336486308d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -136,7 +136,7 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -165,7 +165,7 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 402974b89e5c9..cb43f00f7fe0f 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,7 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 37da8daf7fd69..09861f41874cd 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 425d4939b565f..105c5f0607f69 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -98,7 +98,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 9fc200ca82f1c..ef435721b93a0 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -345,7 +345,7 @@ TEST(EOF_EXCEPTION, THROW_EOF) { EXPECT_TRUE(caught_eof); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { PADDLE_ENFORCE_GPU_SUCCESS(value); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index b5f31fd85847c..fad64a6290486 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -172,7 +172,7 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) try { // use user specified GPUs in single-node multi-process mode. 
devices = platform::GetSelectedDevices(); @@ -215,7 +215,7 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(devices[i])); #endif #ifdef PADDLE_WITH_XPU @@ -226,7 +226,7 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -448,7 +448,7 @@ void InitMemoryMethod() { memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif memory_method->emplace_device_contexts = diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 66fb431af29e9..b643e37765668 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,7 @@ TEST(InitDevices, CUDA) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 959379260419d..9e00bd589dc70 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -57,7 +57,7 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -67,7 +67,7 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 2c65023988dc6..d1b557922af32 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -698,7 +698,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index c71b5a0e49104..7a13582736a50 100644 --- a/paddle/fluid/platform/profiler.h +++ 
b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -197,7 +197,7 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e3fe83c5a74d2..8fa4d8a483c4d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -561,7 +561,7 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 7f9bec1c32a53..81005aa91c10d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -40,7 +40,7 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 1d0970235a128..cc35371e06fc5 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,7 @@ std::unique_ptr DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +155,7 @@ DeserializationReader::~DeserializationReader() { input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 5f99f6fd82c55..c8ac33c5bea49 100644 
--- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,7 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index be1e1c01f8b52..9e46e3a531cd9 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,7 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (auto it = device_property_map.begin(); it != device_property_map.end(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 80d5413106ded..67eafdf44e3cd 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,7 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index eaea4f3850fef..7ec41fd78a5e3 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -137,7 +137,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ProfilerResult::ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -179,7 +179,7 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -187,7 +187,7 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h 
b/paddle/fluid/platform/profiler/event_python.h index dae32a1902834..f1d217674bf6c 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,7 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +166,7 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +176,7 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index e0a91629a10d6..8f34d5acc0bee 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -21,7 +21,7 @@ #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -161,7 +161,7 @@ std::unique_ptr Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto index = 0u; index < device_ids.size(); index++) { diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 1d34d5fd27b3e..5dad7788d0b09 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -142,7 +142,7 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -344,7 +344,7 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 0e1c681288fe1..1b746df388a2b 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -122,7 +122,7 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 2b8969e1b8181..da6dee7657c09 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -24,7 +24,7 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::CUDAStream *get_current_stream(int device_id) { if (device_id == -1) { device_id = phi::backends::gpu::GetCurrentDeviceId(); @@ -51,7 +51,7 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_get_current_stream", [](int deviceId) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::get_current_stream(deviceId); #else PADDLE_THROW( @@ -64,7 +64,7 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_set_current_stream", [](phi::CUDAStream *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return platform::set_current_stream(stream); #else PADDLE_THROW( @@ -75,7 +75,7 @@ void BindCudaStream(py::module *m_ptr) { py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (device_id == -1) { device_id = paddle::platform::GetCurrentDeviceId(); } @@ -115,7 +115,7 @@ void BindCudaStream(py::module *m_ptr) { s3 = paddle.device.cuda.Stream() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def( "wait_event", [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) { @@ -251,7 +251,7 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -277,7 +277,7 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, int device, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -307,7 +307,7 @@ void BindCudaStream(py::module *m_ptr) { py::arg("device") = -1, py::arg("priority") = 2) .def("__init__", [](phi::CUDAStream &self) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int device_id = platform::GetCurrentDeviceId(); auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking; new (&self) phi::CUDAStream( @@ -334,7 +334,7 @@ void BindCudaStream(py::module *m_ptr) { 
event = paddle.device.cuda.Event() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def( "record", [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) { @@ -398,7 +398,7 @@ void BindCudaStream(py::module *m_ptr) { bool enable_timing, bool blocking, bool interprocess) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) unsigned int flags = platform::GenerateDeviceEventFlag( enable_timing, blocking, interprocess); new (&self) paddle::platform::CudaEvent(flags); diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h index d10608a6e8ea9..61f27960e25e9 100644 --- a/paddle/fluid/pybind/cuda_streams_py.h +++ b/paddle/fluid/pybind/cuda_streams_py.h @@ -17,7 +17,7 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #else namespace phi { @@ -29,7 +29,7 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::CUDAStream* get_current_stream(int device_id = -1); phi::CUDAStream* set_current_stream(phi::CUDAStream* stream); #endif diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 59a94a31c448d..7fdfcfe62f6a6 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -58,7 +58,7 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/pybind/cuda_streams_py.h" #endif diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 69d0465bf7cdd..46e099c1ecf5f 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -138,7 +138,7 @@ std::set _complex_dtypes{ void SetDevice(paddle::platform::Place place) { if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << static_cast(place.device); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index eb0e895cf575c..638542ea6dbaf 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -223,7 +223,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, sizeof_dtype * numel); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (self->tensor.is_gpu()) { eager_gil_scoped_release guard; #if defined(PADDLE_WITH_CUDA) @@ -1338,7 +1338,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, self_numpy[_index] = py::object(py::handle(value_obj), true); } if (!self->tensor.initialized()) { -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) SetTensorFromPyArray(self_tensor, self_numpy, platform::Place(platform::CUDAPlace(0)), diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 99621b1463ea9..c9a4e2b7fb52e 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,7 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. .def(py::pickle( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index bdf54bd76b6e1..72b47bb154513 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -43,7 +43,7 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/compat/convert_utils.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -658,7 +658,7 @@ void BindPaddlePredictor(py::module *m) { .def("get_output_names", &PaddlePredictor::GetOutputNames) .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](PaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -705,7 +705,7 @@ void BindNativePredictor(py::module *m) { .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) .def("clone", [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](NativePaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -750,7 +750,7 @@ void BindAnalysisConfig(py::module *m) { .def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass) .def("exp_disable_mixed_precision_ops", &AnalysisConfig::Exp_DisableMixedPrecisionOps) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("set_exec_stream", [](AnalysisConfig &self, phi::CUDAStream &stream) { self.SetExecStream(stream.raw_stream()); @@ -1084,7 +1084,7 @@ void BindAnalysisPredictor(py::module *m) { &AnalysisPredictor::analysis_argument, py::return_value_policy::reference) .def("clone", [](AnalysisPredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](AnalysisPredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -1122,7 +1122,7 @@ void BindPaddleInferPredictor(py::module *m) { .def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("clone", [](paddle_infer::Predictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 9ba115381a2c0..013cac0851154 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -126,7 +126,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index c97bba9be8f2f..2edb4c80d4897 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -126,11 +126,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -318,7 +318,7 @@ void BindPlace(pybind11::module &m) { // NOLINT cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (UNLIKELY(dev_id < 0)) { LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), device id must be 0 or " @@ -357,7 +357,7 @@ void BindPlace(pybind11::module &m) { // NOLINT std::exit(-1); #endif }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("get_device_id", [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) .def("_type", &PlaceIndex) @@ -372,10 +372,10 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -383,7 +383,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -540,7 +540,7
@@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 1a6b640b3a3cf..1a5a048a61383 100644 --- a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -250,7 +250,7 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -307,7 +307,7 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 60ade1f9875fd..3f5fffc1bc036 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -144,7 +144,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif @@ -776,7 +776,7 @@ PYBIND11_MODULE(libpaddle, m) { } }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -828,7 +828,7 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -2199,7 +2199,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2320,7 +2320,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2477,7 +2477,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 98ae45dd0134b..a9ce5910d4eb4 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -126,7 +126,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ba33fcd1d129f..8b4f4dcd62de1 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +325,7 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +362,7 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +457,7 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -790,7 +790,7 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1039,7 +1039,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ(py_arr.writeable(), true, diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 1ed3fac122826..593109d3e8e27 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -123,6 +123,9 @@ if(WITH_GPU) elseif(WITH_ROCM) hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) target_link_libraries(phi ${PHI_DEPS}) +elseif(WITH_MUSA) + musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) + target_link_libraries(phi ${PHI_DEPS}) elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index 7afe17ba8419d..65ddeceb7014c 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -97,7 +97,7 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * Get the current 
CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b626df6c6701c..4224aeae2b5c3 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -396,7 +396,7 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index 292bd8a7e47aa..b3badfdb94ff7 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -63,7 +63,7 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index f9316965be26b..eac1d34ada374 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -93,7 +93,7 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -135,7 +135,7 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -153,7 +153,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " << dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(yy): TransDataPlace should wait for computation of input. 
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index e8caf52530868..4a0b8426fa8d8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -359,7 +359,7 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index b8d25e4f22b10..3384b59158703 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/core/dense_tensor.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include #else @@ -30,7 +30,7 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 cudaPointerAttributes attr; diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index b19f20485227b..ebd613e4a8099 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -62,7 +62,7 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -81,7 +81,7 @@ class Event { int64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -137,7 +137,7 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) public: CudaEvent() { diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 1c916682cf7b1..beb0f88e3efcf 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,6 +16,9 @@ if(WITH_GPU OR WITH_ROCM) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() + if(WITH_MUSA) + list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) + endif() endif() if(WITH_XPU) @@ -43,6 +46,7 @@ list( if(WITH_GPU OR WITH_ROCM + OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index e295ac388d892..372edd66e50d9 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index 6ff90e05fed4a..efce5aac61a71 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -27,7 +27,7 @@ limitations under the License. 
*/ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -46,7 +46,7 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index eb2934d1b4842..27cdf09236d35 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,7 +78,7 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -88,12 +88,12 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 8debb4dc9c45e..64b89b83b42ed 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -48,7 +48,7 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 8508d5206558d..3a430132d9325 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,7 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 5c9c010d365e4..f10ec7019b7b6 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -1046,7 +1046,7 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) GPUPinnedContext::GPUPinnedContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git 
a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index b4a3974378241..2127114de189c 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) #include @@ -276,7 +276,7 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) using KPSContext = GPUContext; #endif @@ -287,7 +287,7 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // Currently, GPUPinnedContext is only used to data copying. class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 0f79e2a645ab3..de4565cb6e7ce 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index f37afa3deeb74..44163d8048f2c 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 2353b42794ffd..428c5dcb96c6a 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index ebf57bd06eb19..2d1b7c1a98f27 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index a7a7ad03ad664..00aa244041bec 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,7 +16,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 77f403795b6b3..effab17059ac4 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,7 +17,7 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" @@ -80,4 +80,4 @@ DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc new file mode 100644 index 0000000000000..6579ce63f21f6 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +#include "musa_runtime_api.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags; +static std::vector<phi::gpuDeviceProp> g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { +  // The muDNN version query is not wired up yet; report 0 as a placeholder. +  return 0; +  //if (!dynload::HasCUDNN()) return -1; +  //size_t version_major, version_minor, version_patch; +  //PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( +  //    &version_major, &version_minor, &version_patch)); +  //return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { +  int driverVersion = 0; +  musaError_t status = musaDriverGetVersion(&driverVersion); + +  if (!(status == gpuSuccess && driverVersion != 0)) { +    // No GPU driver +    VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + return 100; + //int major, minor; + //auto major_error_code = musaDeviceGetAttribute( + // &major, musaDeviceAttributeComputeCapabilityMajor, id); + //auto minor_error_code = musaDeviceGetAttribute( + // &minor, musaDeviceAttributeComputeCapabilityMinor, id); + + //PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + //PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + //return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index 96b46fbc0d4ff..21df6c646cd3e 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,7 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index e9fe2aada1f35..7df79117dbae5 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,7 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 5540592d5013c..3d0bf86c2bca6 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,7 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index a4e003dd544ad..6df324c5ead11 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -37,7 +37,7 @@ #define PADDLE_ALIGN(x) __declspec(align(x)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -62,7 +62,7 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 86168d441ded2..094fc5681c04e 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -82,7 +82,7 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 x = 
reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index f9ef606049297..6dc6c1cba468d 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,7 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index f6a4afcea2f78..3baf7bbe35624 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -118,7 +118,7 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * @brief get the memory usage of current GPU device. * @@ -271,7 +271,7 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -372,7 +372,7 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total); #endif diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index fe15be4b2b909..0f009806e8c53 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -123,7 +123,7 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -169,7 +169,7 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 947c7fb45c5fc..9792f64c5c46d 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -57,7 +57,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { switch (backend) { case phi::Backend::CPU: return phi::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -66,7 +66,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -77,7 +77,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index b27770b081433..58f08a2a36b57 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" @@ -73,6 +78,9 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( + &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -92,6 +100,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -112,6 +122,14 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + musaError_t err = musaStreamQuery(raw_stream()); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 6b98fd0488595..d4ae30598551c 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -90,7 +90,7 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_types.h" #endif diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 0c581fb09919f..ebb54c8173917 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,7 @@ // limitations under the License. 
#include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +120,7 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * CUDA related related FLAG @@ -215,7 +215,7 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +322,7 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) /** * NCCL related FLAG @@ -541,7 +541,7 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) /** @@ -785,7 +785,7 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +800,7 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -1127,7 +1127,7 @@ PHI_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, * Example: * Note: nccl blocking wait. 
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 4ed25af0814df..06ebdc1c0801c 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,7 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,7 +278,7 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; this->state_.thread_offset += increment_offset; diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 6511efa0152ee..5f9a40625fac9 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -120,7 +120,7 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -221,7 +221,7 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index f4e021f7269a7..984b28cf05316 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -60,7 +60,7 @@ struct KernelArgsParseFunctor { #if defined(PADDLE_WITH_MKLDNN) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) @@ -1401,7 +1401,7 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f4dc4636bdde3..d768ba85272aa 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -273,7 +273,7 @@ struct KernelImpl { 
/* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index 857bd546befcd..aba6a0f7bfca2 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,7 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +55,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 0e465982ba429..98ad70622b943 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -114,7 +114,7 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); #else diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index abe44d3e2550b..b4a1343423103 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -58,7 +58,7 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_MKLDNN dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -99,7 +99,7 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -386,7 +386,7 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -468,7 +468,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { 
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -522,7 +522,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -614,7 +614,7 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -714,7 +714,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -756,7 +756,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 2a554525024c8..648ef5c587126 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -60,12 +60,12 @@ template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class TypeInfoTraits; #endif diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 6318b17647cd6..34a8fca61fbbe 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +35,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 71bbfaa333a0a..88c273d6934ee 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -858,7 +858,7 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = phi::SizeOf(dtype); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) int64_t numel = 0; for (size_t i = 0; i < input.size(); ++i) { const auto& dim = input[i]->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 25367be206139..e28210cfca7e4 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -117,7 +117,7 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc index f157c5e054bfb..9626621ae8657 100644 --- a/paddle/phi/kernels/activation_kernel.cc +++ b/paddle/phi/kernels/activation_kernel.cc @@ -32,7 +32,7 @@ using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(relu6, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index db30ec7389619..c44b6333154cc 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -135,7 +135,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 6e496a355302f..9f4b51281cd37 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 17c24fa905b5c..442290c3648e2 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8df5e9a543eb2..54449200ae4b2 100644 --- 
a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 42d137ba4f419..3ecef871d211d 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dc61e6a650efa..6b22ac7518179 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 38beafbfa51b9..982b6a396c2a8 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 999625cf3dfb4..c4bdf29e03949 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,7 +8,7 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 140eca890480f..2ea7a306f16fd 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -360,7 +360,7 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -543,7 +543,7 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 0cd07fdfd0e1a..d731c4f89b751 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +41,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +68,7 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h index 7f7b2be551a57..7a4ea0bb695bd 100644 --- a/paddle/phi/kernels/funcs/layer_norm_util.h +++ b/paddle/phi/kernels/funcs/layer_norm_util.h @@ -36,7 +36,7 @@ struct RowwiseMean2D { DenseTensor* vec); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class RowwiseMean2D { public: @@ -93,7 +93,7 @@ struct ColwiseSum2D { DenseTensor* vec); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class ColwiseSum2D { public: diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 10d18cc958ae3..7c2fd866e3b91 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -239,7 +239,7 @@ void set_constant(const phi::DeviceContext& context, return; } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // tensor->place().apply_visitor(func); phi::VisitPlace(tensor->place(), func); #elif defined(PADDLE_WITH_XPU) diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index b42714e80db2f..bce782049b8a8 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -25,7 +25,7 @@ limitations under the License. */ namespace phi { namespace funcs { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template void BatchTranspose(T* output, const T* input, diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index bf2409d2e502b..3d95ef45eaae6 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/macros.h" // import FLT_MAX -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_decls.h" #endif @@ -115,7 +115,7 @@ HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class Pool2dDirectCUDAFunctor { public: @@ -211,7 +211,7 @@ class MaxPool2dGradFunctor { DenseTensor* input_grad); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class Pool3dDirectCUDAFunctor { public: diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 96b7942cf2709..2976968d07b70 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -15,7 +15,7 @@ #pragma once // CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/phi/kernels/funcs/softmax.h b/paddle/phi/kernels/funcs/softmax.h index 80805eb6d76f6..1198b80a9e879 100644 --- a/paddle/phi/kernels/funcs/softmax.h +++ b/paddle/phi/kernels/funcs/softmax.h @@ -37,7 +37,7 @@ class SoftmaxGradFunctor { phi::DenseTensor* x_grad); }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class SoftmaxCUDNNFunctor { public: diff --git a/paddle/phi/kernels/funcs/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h index de38e40d317e1..0e9dc896c3629 100644 --- a/paddle/phi/kernels/funcs/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/strided_memcpy.h @@ -56,7 +56,7 @@ inline void CopyWithContext(const Context& ctx, const Place& src_place, const void* src, size_t num) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream()); #else diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h index c1d60cbffee2f..418fa8bf55ce9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/funcs/aligned_vector.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #define WARP_SIZE 64 diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 1a07e5f0d4909..5c2d76be35992 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -611,7 +611,7 @@ void BatchNormKernel(const Context &ctx, } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP +#if 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(wangran16): wait for MIOpen to improve the performance of BN // mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index cc3cad38f46fb..428f105c9743a 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -15,7 +15,7 @@ #pragma once // CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ defined(PADDLE_WITH_XPU_KP) #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index 7e01c1ae84391..c3c918c21cb35 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -15,7 +15,7 @@ #pragma once // CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include #include diff --git a/paddle/phi/kernels/group_norm_kernel.h b/paddle/phi/kernels/group_norm_kernel.h index f3e39ddbeb328..ec134fa47eecd 100644 --- a/paddle/phi/kernels/group_norm_kernel.h +++ b/paddle/phi/kernels/group_norm_kernel.h @@ -33,7 +33,7 @@ void GroupNormKernel(const Context& dev_ctx, DenseTensor* mean, DenseTensor* variance); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class GroupNormDirectCUDAFunctor { public: diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h index 216d5e6100d6c..82b99b07a8927 100644 --- a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h +++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h @@ -64,7 +64,7 @@ void SegmentKernelLaunchHelper(const Context& dev_ctx, phi::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (!cpu_place) { DenseTensor length; length.Resize(phi::make_ddim({1})); diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index 4b4bd6f5143dd..015c7a0764a2b 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -205,7 +205,7 @@ class WarpCTCFunctor { warpctc_version_ = phi::dynload::get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) options_.loc = CTC_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); diff --git a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h index f51041285aaee..f36ec9c007eda 100644 --- a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h +++ b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h @@ -139,7 +139,7 @@ class WarpRNNTFunctor { rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) gpu = true; #else 
PADDLE_THROW(errors::PreconditionNotMet( @@ -208,7 +208,7 @@ class WarpRNNTFunctor { options_.batch_first = true; if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) options_.loc = RNNT_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc index 4b86f2dfe6950..f420a419f5c67 100644 --- a/paddle/phi/kernels/is_empty_kernel.cc +++ b/paddle/phi/kernels/is_empty_kernel.cc @@ -43,7 +43,7 @@ PD_REGISTER_KERNEL(is_empty, kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(is_empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index e88714c370be9..d72d051ba1bf8 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -87,7 +87,7 @@ void ElementwisePowKernel(const Context& dev_ctx, } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(maximum, KPS, diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h index 2fddcec2278c9..ee8a324e09b4f 100644 --- a/paddle/phi/kernels/layer_norm_kernel.h +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -30,7 +30,7 @@ void LayerNormKernel(const Context& ctx, DenseTensor* mean, DenseTensor* variance); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) template class LayerNormDirectCUDAFunctor { public: diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index 49d69a23fedd1..62a6cbc8ea840 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -117,7 +117,7 @@ void MemcpyKernel(const Context& dev_ctx, dev_ctx.HostAlloc(out, out->dtype()); Copy(dev_ctx, x, CPUPlace(), true, out); break; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) case 1: /* CUDAPlace */ dev_ctx.Alloc(out, x.dtype()); Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); @@ -162,7 +162,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/npu_identity_kernel.cc b/paddle/phi/kernels/npu_identity_kernel.cc index 89a0c63c8a495..12d933af78733 100644 --- a/paddle/phi/kernels/npu_identity_kernel.cc +++ b/paddle/phi/kernels/npu_identity_kernel.cc @@ -62,7 +62,7 @@ PD_REGISTER_KERNEL(npu_identity, bool, phi::dtype::float16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(npu_identity, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/prod_kernel.cc b/paddle/phi/kernels/prod_kernel.cc index ea3faaebd9582..4e5546ca0df01 100644 --- 
a/paddle/phi/kernels/prod_kernel.cc +++ b/paddle/phi/kernels/prod_kernel.cc @@ -40,7 +40,7 @@ PD_REGISTER_KERNEL(prod_infer, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(prod_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc index 9e799f0d219fc..3b33c7f665e79 100644 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -40,7 +40,7 @@ void AllKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} #endif diff --git a/paddle/phi/kernels/reduce_amax_kernel.cc b/paddle/phi/kernels/reduce_amax_kernel.cc index 87e432c5c20a7..466d0497b2d8e 100644 --- a/paddle/phi/kernels/reduce_amax_kernel.cc +++ b/paddle/phi/kernels/reduce_amax_kernel.cc @@ -34,7 +34,7 @@ void AMaxKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( amax, CPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( amax, GPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {} #endif diff --git a/paddle/phi/kernels/reduce_amin_kernel.cc b/paddle/phi/kernels/reduce_amin_kernel.cc index a355da64230dc..a30ab4a91956d 100644 --- a/paddle/phi/kernels/reduce_amin_kernel.cc +++ b/paddle/phi/kernels/reduce_amin_kernel.cc @@ -34,7 +34,7 @@ void AMinKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( amin, CPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( amin, GPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {} #endif diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc index 9d162f8e02033..0b6f4028b62ac 100644 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ b/paddle/phi/kernels/reduce_any_kernel.cc @@ -33,7 +33,7 @@ void AnyKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} #endif diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 21b02412d31ca..fb8ea2f97bbea 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -41,7 +41,7 @@ PD_REGISTER_KERNEL(mean, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(mean, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc index de9688d4e60aa..59d192014da1d 100644 --- a/paddle/phi/kernels/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -53,7 +53,7 @@ PD_REGISTER_KERNEL(sum, 
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(sum, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/reverse_kernel.cc b/paddle/phi/kernels/reverse_kernel.cc index 771acacedf024..d8c8f5a966376 100644 --- a/paddle/phi/kernels/reverse_kernel.cc +++ b/paddle/phi/kernels/reverse_kernel.cc @@ -61,7 +61,7 @@ PD_REGISTER_KERNEL(reverse_array, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(reverse_array, GPU, diff --git a/paddle/phi/kernels/selected_rows/activation_kernel.cc b/paddle/phi/kernels/selected_rows/activation_kernel.cc index 4a27d0763a235..6bd55f701bb33 100644 --- a/paddle/phi/kernels/selected_rows/activation_kernel.cc +++ b/paddle/phi/kernels/selected_rows/activation_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(square_sr, GPU, diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc index 081d85e68c959..481f5f6fcf852 100644 --- a/paddle/phi/kernels/selected_rows/assign_kernel.cc +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -41,7 +41,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index dccbba6947a1b..0ea7fbe8857c4 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -84,7 +84,7 @@ PD_REGISTER_KERNEL(multiply_sr, complex64, complex128) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(multiply_raw_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index e04139448dddc..b593e6db3f936 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/selected_rows/full_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif #include "paddle/phi/common/bfloat16.h" @@ -54,7 +54,7 @@ PD_REGISTER_KERNEL(full_sr, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc index d68688a7e400a..e3489f50e2184 100644 --- a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/selected_rows/isfinite_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif #include "paddle/phi/core/kernel_registry.h" @@ -51,7 +51,7 @@ PD_REGISTER_KERNEL(isfinite_sr, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(isinf_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc index a5d2e66787316..7b6f7e9ceefa4 100644 --- a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc +++ b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc @@ -41,7 +41,7 @@ PD_REGISTER_KERNEL(merge_selected_rows, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(merge_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc index 38a0cb75101b7..f6f9d587c4022 100644 --- a/paddle/phi/kernels/selected_rows/scale_kernel.cc +++ b/paddle/phi/kernels/selected_rows/scale_kernel.cc @@ -54,7 +54,7 @@ PD_REGISTER_KERNEL(scale_sr, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(scale_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc index f44a6a8dfafc5..0a07bee7b6974 100644 --- a/paddle/phi/kernels/selected_rows/shape_kernel.cc +++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc @@ -52,7 +52,7 @@ PD_REGISTER_KERNEL(shape_sr, kernel->OutputAt(0).SetDataType(phi::DataType::INT32); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(shape_sr, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/selected_rows/uniform_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_kernel.cc index 0af5d8788c71f..90bee1744e962 100644 --- a/paddle/phi/kernels/selected_rows/uniform_kernel.cc +++ b/paddle/phi/kernels/selected_rows/uniform_kernel.cc @@ -77,7 +77,7 @@ 
PD_REGISTER_KERNEL(uniform_sr, double, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(uniform_raw_sr, GPU, diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc index c4190a5f59b62..e7556d1401954 100644 --- a/paddle/phi/kernels/shape_kernel.cc +++ b/paddle/phi/kernels/shape_kernel.cc @@ -51,7 +51,7 @@ PD_REGISTER_KERNEL(shape, kernel->OutputAt(0).SetDataType(phi::DataType::INT32); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(shape, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index 49a377ca70f67..44ccdd3bda634 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -82,7 +82,7 @@ PD_REGISTER_KERNEL(empty_like_csr, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty_like_coo, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 064867610d719..8e9ed654760f3 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -81,7 +81,7 @@ PD_REGISTER_KERNEL(sparse_coo_tensor_grad, kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(values_coo_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/squeeze_grad_kernel.cc b/paddle/phi/kernels/squeeze_grad_kernel.cc index 473acf9d7a1d1..3eab4daf5740a 100644 --- a/paddle/phi/kernels/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/squeeze_grad_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(squeeze_grad, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(squeeze_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/squeeze_kernel.cc b/paddle/phi/kernels/squeeze_kernel.cc index d495b040921b5..933540cd787e4 100644 --- a/paddle/phi/kernels/squeeze_kernel.cc +++ b/paddle/phi/kernels/squeeze_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(squeeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(squeeze_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/strided_slice_grad_kernel.cc b/paddle/phi/kernels/strided_slice_grad_kernel.cc index 7582f751bf16a..dd5bd42a3f48a 100644 --- a/paddle/phi/kernels/strided_slice_grad_kernel.cc +++ b/paddle/phi/kernels/strided_slice_grad_kernel.cc @@ -55,7 +55,7 @@ PD_REGISTER_KERNEL(strided_slice_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(strided_slice_grad, GPU, ALL_LAYOUT, diff 
--git a/paddle/phi/kernels/strided_slice_kernel.cc b/paddle/phi/kernels/strided_slice_kernel.cc index 68377dbe8468e..79e43de25e9a8 100644 --- a/paddle/phi/kernels/strided_slice_kernel.cc +++ b/paddle/phi/kernels/strided_slice_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(strided_slice, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(strided_slice, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 36cad02618424..a6c2aba97b5e8 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -23,7 +23,7 @@ limitations under the License. */ namespace phi { namespace strings { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) __global__ void SerializeStringsData(const phi::dtype::pstring* src_str, uint8_t* strings_data, int32_t* strings_offset, @@ -146,7 +146,7 @@ void DeserializeOnCPU(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) void SerializeOnGPU(const phi::GPUContext& dev_ctx, const StringTensor& src, DenseTensor* dst) { diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.cc b/paddle/phi/kernels/strings/strings_empty_kernel.cc index 22a43ceaff1c1..60a75584587d3 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.cc +++ b/paddle/phi/kernels/strings/strings_empty_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, phi::strings::EmptyLikeKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/strings/unicode.cc b/paddle/phi/kernels/strings/unicode.cc index 9f636809de876..75e48f1ce982e 100644 --- a/paddle/phi/kernels/strings/unicode.cc +++ b/paddle/phi/kernels/strings/unicode.cc @@ -46,7 +46,7 @@ const uint16_t* GetCharcasesMap() { return reinterpret_cast(utils_map[0]); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const uint8_t* GetGPUUniflagMap() { if (utils_map[3] == nullptr) { diff --git a/paddle/phi/kernels/strings/unicode.h b/paddle/phi/kernels/strings/unicode.h index 45e41b72d086c..89ec9efa15189 100644 --- a/paddle/phi/kernels/strings/unicode.h +++ b/paddle/phi/kernels/strings/unicode.h @@ -188,7 +188,7 @@ HOSTDEVICE inline void GetUTF8Str(const uint32_t* unicode_str, const uint8_t* GetUniFlagMap(); const uint16_t* GetCharcasesMap(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) const uint8_t* GetGPUUniflagMap(); const uint16_t* GetGPUCharcasesMap(); diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 84b978436e163..5ee69e5964918 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -71,7 +71,7 @@ void TransferLayoutGeneral(const Context& dev_ctx, out->Resize(phi::make_ddim(dst_dim)); dev_ctx.Alloc(out, x.dtype()); 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // In GPU fp16 model, we will insert many transfer_layout ops in // conv2d_fusion_layout_transfer_pass, so we optimize this kernel on GPU if (std::is_same::value) { @@ -221,7 +221,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout, CPU, ALL_LAYOUT, phi::TransferLayoutKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/unsqueeze_grad_kernel.cc index 3c119db2c73d6..e294c3a983769 100644 --- a/paddle/phi/kernels/unsqueeze_grad_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_grad_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(unsqueeze_grad, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(unsqueeze_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/unsqueeze_kernel.cc b/paddle/phi/kernels/unsqueeze_kernel.cc index c08c31da4ef0c..6e03176857e4c 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_kernel.cc @@ -80,7 +80,7 @@ PD_REGISTER_KERNEL(unsqueeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(unsqueeze_infer, GPU, ALL_LAYOUT, diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 7295e86182734..c3ca58e0e4a94 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) DECLARE_bool(enable_gpu_memory_usage_log); #endif @@ -84,7 +84,7 @@ int main(int argc, char** argv) { VLOG(1) << "gtest undefok_string:" << undefok_string; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) if (strstr(undefok_str, "enable_gpu_memory_usage_log")) { VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true"; FLAGS_enable_gpu_memory_usage_log = true; From a705c264bf5c5b213e3def54afd047df29f6e889 Mon Sep 17 00:00:00 2001 From: HanHaowen Date: Tue, 25 Jul 2023 16:20:25 +0800 Subject: [PATCH 02/55] add musa macro in phi folder except phi/kernels folder --- paddle/phi/api/include/tensor.h | 5 + paddle/phi/api/profiler/event.h | 23 ++++ paddle/phi/backends/device_code.cc | 125 ++++++++++++++++++ paddle/phi/backends/device_code.h | 9 ++ paddle/phi/backends/dynload/dynamic_loader.cc | 5 + paddle/phi/backends/gpu/gpu_context.cc | 59 +++++++++ paddle/phi/backends/gpu/gpu_decls.h | 4 + paddle/phi/backends/gpu/gpu_device_function.h | 2 + paddle/phi/backends/gpu/gpu_dnn.h | 3 + paddle/phi/backends/gpu/gpu_helper.h | 2 + paddle/phi/backends/gpu/gpu_primitives.h | 3 + paddle/phi/backends/gpu/gpu_resources.cc | 88 +++++++++++- paddle/phi/backends/gpu/gpu_types.h | 10 +- .../backends/gpu/rocm/rocm_device_function.h | 2 + paddle/phi/common/bfloat16.h | 17 +++ paddle/phi/common/complex.h | 14 ++ paddle/phi/common/float16.h | 4 + paddle/phi/core/cuda_stream.h | 9 ++ paddle/phi/core/enforce.h | 24 ++++ paddle/phi/core/string_tensor.cc | 2 + 20 files changed, 408 insertions(+), 2 deletions(-) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 4224aeae2b5c3..b2c687a1f448d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index ebd613e4a8099..3a789cad101f4 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,6 +27,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" @@ -143,6 +146,8 @@ class CudaEvent { CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -152,6 +157,8 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -161,6 +168,8 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -169,6 +178,8 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -183,6 +194,14 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event_); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -199,6 +218,8 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -208,6 +229,8 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; +#elif defined(PADDLE_WITH_MUSA) + unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index 27cdf09236d35..529e42fc4c95b 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -101,6 +101,13 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); +#elif defined(PADDLE_WITH_MUSA) +static bool CheckCUDADriverResult(MUresult result, + std::string caller, + std::string kernel_name = "") { + if (result != MUSA_SUCCESS) { + const char* error = nullptr; + dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -130,6 +137,8 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#elif defined(PADDLE_WITH_MUSA) + nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -140,6 +149,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + if 
(driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -153,6 +165,8 @@ << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + if (nvrtc_result != NVRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -163,6 +177,9 @@ #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { +#elif defined(PADDLE_WITH_MUSA) + if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -202,6 +219,8 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; +#elif defined(PADDLE_WITH_MUSA) + cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -229,6 +248,8 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; +#elif defined(PADDLE_WITH_MUSA) + kernel_ = kernel; #else kernel_ = kernel; #endif @@ -318,6 +339,86 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } +#elif defined(PADDLE_WITH_MUSA) + nvrtcProgram program; + if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "nvrtcCreateProgram")) { + return false; + } + + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::string compute_flag = + "--gpu-architecture=compute_" + std::to_string(compute_capability); + std::vector options = {"--std=c++11", compute_flag.c_str()}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + nvrtcResult compile_result = + dynload::nvrtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == NVRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::nvrtcGetProgramLogSize(program, &log_size), + "nvrtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if (!CheckNVRTCResult(dynload::nvrtcGetProgramLog(program, log.data()), + "nvrtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of CUDA code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program + size_t ptx_size; + if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size), + "nvrtcGetPTXSize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()), + "nvrtcGetPTX")) { + return false; + } + + if (!CheckNVRTCResult(dynload::nvrtcDestroyProgram(&program), + "nvrtcDestroyProgram")) {
+ return false; + } + + if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), + "muModuleLoadData", + name_)) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::muModuleGetFunction(&function_, module_, name_.c_str()), + "muModuleGetFunction", + name_)) { + return false; + } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -436,6 +537,22 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_EQ( + dynload::muLaunchKernel(function_, + num_blocks, + 1, + 1, // grid dim + num_threads_, + 1, + 1, // block dim + 0, // shared memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + MUSA_SUCCESS, + errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -464,6 +581,14 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } +#elif defined(PADDLE_WITH_MUSA) +bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { + if (result != NVRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::nvrtcGetErrorString(result); + return false; + } #else bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 64b89b83b42ed..63d221ea8c89a 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,6 +26,10 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musa_driver.h" +#include "paddle/phi/backends/dynload/nvrtc.h" +#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" @@ -68,6 +72,8 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); +#elif defined(PADDLE_WITH_MUSA) + bool CheckNVRTCResult(cudartcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -82,6 +88,9 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; +#elif defined(PADDLE_WITH_MUSA) + MUmodule module_; + MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 354ff5b7dc855..fd6d3ef9e0097 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -506,6 +506,11 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); +#elif defined(PADDLE_WITH_MUSA) + std::string warning_msg( + "You may need to install 'mccl' from MUSA official website: " + "https://rocmdocs.amd.com/en/latest/Installation_Guide/" + "Installation-Guide.html before install PaddlePaddle."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f10ec7019b7b6..9291f3d00d8f3 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -43,6 +43,17 @@ limitations under the License. 
*/ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musolver.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" @@ -119,6 +130,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -143,6 +157,16 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif + +#ifdef PADDLE_WITH_MUSA +#if MUSA_VERSION >= 10000 + static void MUDART_CB StreamCallbackFunc(void* user_data) +#else + static void MUDART_CB + StreamCallbackFunc(musaStream_t stream, musaError_t status, void* user_data) +#endif +#endif + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void* user_data) @@ -170,6 +194,8 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); +#elif defined(PADDLE_WITH_MUSA) + auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -178,6 +204,8 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -464,6 +492,11 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -529,6 +562,18 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) + +#elif defined(PADDLE_WITH_MUSA) + musaError_t e_sync = musaSuccess; +#if !defined(_WIN32) + e_sync = musaStreamSynchronize(stream()); +#else + while (e_sync = musaStreamQuery(stream())) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) + #else // PADDLE_WITH_HIP cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) @@ -547,6 +592,8 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif @@ -678,6 +725,8 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); +#elif 
defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -708,6 +757,16 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif +#endif + +#ifdef PADDLE_WITH_MUSA +#if MUSA_VERSION >= 10000 + PADDLE_ENFORCE_GPU_SUCCESS( + musaLaunchHostFunc(stream(), internal::StreamCallbackFunc, func)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); +#endif #endif } diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 4a6b9d2fd87f1..93dba9764478a 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -23,6 +23,10 @@ namespace phi { #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; +#elif defined(PADDLE_WITH_MUSA) + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index de4565cb6e7ce..5c0c475b140ff 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -17,6 +17,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index 44163d8048f2c..30cf3fae80519 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -19,6 +19,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" +#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 428c5dcb96c6a..8afa826408cb7 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -17,6 +17,8 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index a77527c081650..b7c9f9c4688dc 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,6 +16,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index a447df94cb4dc..b60d0cccd3dc5 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -33,6 +33,19 @@ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA + + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mublasLt.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musolver.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" #endif @@ -144,6 +157,44 @@ void InitGpuProperties(Place place, << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } +#elif defined(PADDLE_WITH_MUSA) + size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." + << (mudnn_dso_ver % 1000) / 100 << "."; + + // Check MUSA/MUDNN version compatiblity + auto local_musa_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_musa_version = + (MUSA_VERSION / 1000) * 10 + (MUSA_VERSION % 100) / 10; +#if defined(__linux__) + PADDLE_ENFORCE_EQ( + (local_musa_version / 10 < compile_musa_version / 10) && + (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with MUDA%d/muDNN%d," + "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " + "which will cause serious incompatible bug. " + "Please recompile or reinstall Paddle with compatible MUSA/muDNN " + "version.", + compile_musa_version / 10, + MUDNN_VERSION / 1000, + local_musa_version / 10, + mudnn_dso_ver / 1000)); +#endif + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) @@ -189,6 +240,9 @@ void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -199,6 +253,8 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -210,7 +266,11 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); -#else // PADDLE_WITH_CUDA +#elif defined(PADDLE_WITH_MUSA) + PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mublasCreate(blas_handle)); + PADDLE_RETRY_MUSA_SUCCESS( + phi::dynload::mublasSetStream(*blas_handle, stream)); +#else // PADDLE_WITH_MUSA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( phi::dynload::cublasSetStream(*blas_handle, stream)); @@ -223,6 +283,11 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::mublasDestroy(handle); + handle = nullptr; + } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -268,6 +333,22 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); +#elif defined(PADDLE_WITH_MUSA) + auto local_cudnn_version = phi::dynload::mudnnGetVersion() / 100; + auto compile_mudnn_version = MUDNN_VERSION / 100; + if (local_mudnn_version < static_cast(compile_mudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with MUDNN " + << compile_mudnn_version / 10 << "." << compile_mudnn_version % 10 + << ", but MUDNN version in your machine is " + << local_mudnn_version / 10 << "." << local_mudnn_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MUDNN " + "version."; + } + PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnCreate(handle)); + PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnSetStream(*handle, stream)); #else auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; @@ -296,6 +377,11 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(handle)); + handle = nullptr; + } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index effab17059ac4..36e094f4a0814 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -22,6 +22,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -32,7 +35,9 @@ namespace phi { #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; - +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ @@ -56,6 +61,9 @@ DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, #ifdef PADDLE_WITH_HIP #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; diff --git a/paddle/phi/backends/gpu/rocm/rocm_device_function.h b/paddle/phi/backends/gpu/rocm/rocm_device_function.h index 6f5d684075f0f..0785ba2dd1cdb 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_device_function.h +++ b/paddle/phi/backends/gpu/rocm/rocm_device_function.h @@ -132,6 +132,8 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. 
#ifdef PADDLE_WITH_HIP const int warpSize = 64; +#elif defined(PADDLE_WITH_MUSA) + const int warpSize = 32; #else const int warpSize = 32; #endif diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 7ea9b0cbb6477..4cc21a14faac8 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -61,6 +61,13 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#elif defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_MUSA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); +#else + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif #else #if defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); @@ -154,6 +161,16 @@ struct PADDLE_ALIGN(2) bfloat16 { uint16_t* temp_ptr = reinterpret_cast(&temp); res = *temp_ptr; return res; +#elif defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_MUSA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else + float val = 0.f; + uint16_t temp = x; + std::memcpy( + reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); + return val; +#endif #else #ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 6df324c5ead11..43e513146ba0a 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,6 +26,11 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include // NOLINT @@ -83,6 +88,15 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } + +#elif defined(PADDLE_WITH_MUSA) + HOSTDEVICE inline explicit operator muFloatComplex() const { + return make_muFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator muDoubleComplex() const { + return make_muDoubleComplex(real, imag); + } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 094fc5681c04e..572f460197f08 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,6 +37,10 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 58f08a2a36b57..87ab5e23818fb 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_CUDA +#include +using gpuStream_t = cudaStream_t; +#endif + #ifdef PADDLE_WITH_MUSA #include using gpuStream_t = musaStream_t; @@ -152,6 +157,8 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -164,6 +171,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 
d4ae30598551c..cda5a3a49c528 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -35,6 +35,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include @@ -75,6 +85,20 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/murand.h" +#include "paddle/phi/backends/dynload/musolver.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include + +#include "paddle/phi/backends/dynload/mccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_MUSA + + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 98ad70622b943..1d95e16e2d9cc 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -117,6 +117,8 @@ void StringTensor::init_holder() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif From be2dfd9841a22212a5fe0d6e717aa155dc7717df Mon Sep 17 00:00:00 2001 From: Xiaokang Shang Date: Tue, 25 Jul 2023 06:57:57 +0000 Subject: [PATCH 03/55] add musa device context and allocator --- .../allocation/naive_best_fit_allocator.cc | 2 +- paddle/fluid/platform/collective_helper.cc | 2 + paddle/fluid/platform/device/gpu/gpu_info.cc | 14 +- paddle/fluid/platform/device/gpu/gpu_types.h | 39 +- paddle/phi/backends/gpu/gpu_context.cc | 4 + paddle/phi/backends/gpu/musa/miopen_desc.h | 264 ++++++++ paddle/phi/backends/gpu/musa/miopen_helper.h | 595 ++++++++++++++++++ .../backends/gpu/musa/rocm_device_function.h | 165 +++++ paddle/phi/backends/gpu/musa/rocm_helper.h | 74 +++ paddle/phi/backends/gpu/musa/rocm_info.cc | 334 ++++++++++ 10 files changed, 1473 insertions(+), 20 deletions(-) create mode 100644 paddle/phi/backends/gpu/musa/miopen_desc.h create mode 100644 paddle/phi/backends/gpu/musa/miopen_helper.h create mode 100644 paddle/phi/backends/gpu/musa/rocm_device_function.h create mode 100644 paddle/phi/backends/gpu/musa/rocm_helper.h create mode 100644 paddle/phi/backends/gpu/musa/rocm_info.cc diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 27a6e3857f224..a7af040f86c5f 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { template <> size_t Used(const platform::CUDAPlace &place) { -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index b133a57d523ac..a6c2b9d61dd2b 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -174,6 +174,8 @@ void 
NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 94c85105115d6..73fe0ca05ba73 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#elif defined(PADDLE_WITH_MUSA) +//TODO(Xiaokang Shang) #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -217,7 +219,11 @@ class RecordedGpuMallocHelper { result = hipMalloc(ptr, size); } #elif defined(PADDLE_WITH_MUSA) - result = musaMalloc(ptr, size); + if (UNLIKELY(malloc_managed_memory)) { + result = musaMallocManaged(ptr, size); + } else { + result = musaMalloc(ptr, size); + } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -264,9 +270,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { -#elif define(PADDLE_WITH_MUSA) +#elif defined(PADDLE_WITH_MUSA) auto err = musaFree(ptr); - if (err != musaErrorMusaUnloading) { + if (err != musaErrorInvalidValue) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -314,7 +320,7 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); -#elif define(PADDLE_WITH_MUSA) +#elif defined(PADDLE_WITH_MUSA) auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index 83497a2507005..b3d4c7071c216 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -23,6 +23,9 @@ #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include +//TODO(Xiaokang Shang) #else #include @@ -34,24 +37,30 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; + +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_T); 
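// Illustrative expansion (a sketch, not part of this change): with the fourth
// macro argument in place, a PADDLE_WITH_MUSA build resolves the aliases
// above to the MUSA runtime types, e.g.
//
//   using gpuStream_t = musaStream_t;
//   using gpuError_t  = musaError_t;
//   using gpuEvent_t  = musaEvent_t;  // the declaration above spells this
//                                     // musaEvent_T; the runtime type is
//                                     // presumably lower-case musaEvent_t
//
// so backend-neutral code declares handles once through the gpu* aliases and
// only branches per backend at the API-call sites, e.g. (musaEventCreate is
// assumed to mirror cudaEventCreate):
//
//   gpuEvent_t ev;                          // musaEvent_t in a MUSA build
//   gpuError_t err = musaEventCreate(&ev);  // compared against gpuSuccess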
+DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp_t); -DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +// TODO(Xiaokang Shang): confirm mudnn type +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, - miopenActivationDescriptor); + miopenActivationDescriptor, + mudnnActivationStruct); DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); @@ -80,9 +89,9 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. @@ -93,21 +102,21 @@ using CUDAGraphID = unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; -#elif PADDLE_WITH_MUSA +#elif defined(PADDLE_WITH_MUSA) #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, hipErrorOutOfMemory, - musaErrorMemoryAllocation); + musaErrorOutOfMemory); DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 9291f3d00d8f3..e954c7db337aa 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -62,6 +62,10 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP +#ifdef PADDLE_WITH_MUSA + +#endif + // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/phi/backends/gpu/musa/miopen_desc.h b/paddle/phi/backends/gpu/musa/miopen_desc.h new file mode 100644 index 0000000000000..ae0e274ca650e --- /dev/null +++ b/paddle/phi/backends/gpu/musa/miopen_desc.h @@ -0,0 +1,264 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace backends { +namespace gpu { + +inline std::vector TransformDimOrder(const std::vector& dims) { + std::vector transformed_dims(dims.begin(), dims.end()); + int H, W, D, C; + if (dims.size() == 4) { + H = dims[1]; + W = dims[2]; + C = dims[3]; + transformed_dims[1] = C; + transformed_dims[2] = H; + transformed_dims[3] = W; + } else { + D = dims[1]; + H = dims[2]; + W = dims[3]; + C = dims[4]; + transformed_dims[1] = C; + transformed_dims[2] = D; + transformed_dims[3] = H; + transformed_dims[4] = W; + } + return transformed_dims; +} + +inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { + miopenDataType_t type = miopenFloat; + switch (t) { + case phi::DataType::FLOAT16: + type = miopenHalf; + break; + case phi::DataType::FLOAT32: + type = miopenFloat; + break; + default: + break; + } + return type; +} + +class ActivationDescriptor { + public: + using T = miopenActivationDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyActivationDescriptor(t)); + t = nullptr; + } + } + }; + ActivationDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + template + void set(miopenActivationMode_t mode, const T& coef) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( + desc_.get(), mode, static_cast(coef), 0.0, 0.0)); + } + + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + private: + std::unique_ptr desc_; +}; + +class TensorDescriptor { + public: + using T = miopenTensorDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(t)); + t = nullptr; + } + } + }; + TensorDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(const phi::DenseTensor& tensor, const int groups = 1) { + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), + ToCudnnDataType(tensor.dtype()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + void set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { + const int groups = 1; + PADDLE_ENFORCE_EQ( + format, + MIOPEN_TENSOR_NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), 
dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), + ToCudnnDataType(tensor.dtype()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + private: + std::unique_ptr desc_; +}; + +class FilterDescriptor { + public: + using T = miopenTensorDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(t)); + t = nullptr; + } + } + }; + FilterDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(const phi::DenseTensor& tensor, + const miopenTensorFormat_t format, + const int groups = 1) { + PADDLE_ENFORCE_EQ( + format, + MIOPEN_TENSOR_NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); + auto dims = phi::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), + ToCudnnDataType(tensor.dtype()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + private: + std::unique_ptr desc_; +}; + +class ConvolutionDescriptor { + public: + using T = miopenConvolutionDescriptor; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyConvolutionDescriptor(t)); + t = nullptr; + } + } + }; + ConvolutionDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + void set(miopenDataType_t dtype, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations, + bool allow_tf32, + const int groups = 1) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( + (miopenConvolutionDescriptor_t)desc_.get(), + static_cast(pads.size()), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + miopenConvolution)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( + (miopenConvolutionDescriptor_t)desc_.get(), groups)); + } + + private: + std::unique_ptr desc_; +}; + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/miopen_helper.h b/paddle/phi/backends/gpu/musa/miopen_helper.h new file mode 100644 index 0000000000000..095f32ba460d0 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/miopen_helper.h @@ -0,0 +1,595 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/phi/backends/dynload/miopen.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/macros.h" + +// MIOPEN do not have epslion definition +#define CUDNN_BN_MIN_EPSILON 1e-05 + +DECLARE_bool(cudnn_deterministic); + +namespace phi { +namespace backends { +namespace gpu { + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "miopenStatusSuccess"; + case miopenStatusNotInitialized: + return "miopenStatusNotInitialized"; + case miopenStatusAllocFailed: + return "miopenStatusAllocFailed"; + case miopenStatusBadParm: + return "miopenStatusBadParm"; + case miopenStatusInternalError: + return "miopenStatusInternalError"; + case miopenStatusInvalidValue: + return "miopenStatusInvalidValue"; + case miopenStatusUnknownError: + return "miopenStatusUnknownError"; + case miopenStatusNotImplemented: + return "miopenStatusNotImplemented"; + default: + return "Unknown miopen error number"; + } +} + +// no use, but will have compiling error if not defined +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + kNDHWC, // add, liyamei + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kMaximumDeterministic, + kAverageExclusive, + kAverageInclusive, +}; + +enum class ActivationMode { + kNone, // activation identity + kSigmoid, + kRelu, + kRelu6, + kReluX, + kTanh, + kBandPass, +}; + +inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return miopenPoolingMax; + case PoolingMode::kAverageExclusive: + return miopenPoolingAverage; + case PoolingMode::kAverageInclusive: + return miopenPoolingAverageInclusive; + case PoolingMode::kMaximum: + return miopenPoolingMax; + default: + PADDLE_THROW( + phi::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); + } +} + +inline ActivationMode StringToActivationMode(const std::string& str) { + if (str == "identity") { + return ActivationMode::kNone; + } else if (str == "sigmoid") { + return ActivationMode::kSigmoid; + } else if (str == "relu") { + return ActivationMode::kRelu; + } else if (str == "relu6") { + return ActivationMode::kRelu6; + } else if (str == "relux") { + return ActivationMode::kReluX; + } else if (str == "tanh") { + return ActivationMode::kTanh; + } else if (str == "bandpass") { + return ActivationMode::kBandPass; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unknown MIOPEN activation string: %s.", str)); + } +} + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenHalf; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = 
float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenBFloat16; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenFloat; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { + switch (order) { + case DataLayout::kNHWC: + return MIOPEN_TENSOR_NHWC; + case DataLayout::kNCHW: + return MIOPEN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return MIOPEN_TENSOR_NCHW; + case DataLayout::kNDHWC: + return MIOPEN_TENSOR_NHWC; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "MIOPEN has no equivalent dataLayout for input order.")); + } + return MIOPEN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(desc_)); + } + + inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, + const miopenDataType_t type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Here, Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + + // MIOPEN ONLY support data layout of NCHW + PADDLE_ENFORCE_EQ( + format, + MIOPEN_TENSOR_NCHW, + phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); + if (dims.size() == 4) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + type, + dims_with_group.size(), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } else if (dims.size() == 5) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + type, + dims_with_group.size(), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + return descriptor( + GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); + } + + inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, + const std::vector& dim, + const std::vector& stride) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + miopen_type, + dim.size(), + const_cast(dim.data()), + const_cast(stride.data()))); + return desc_; + } + + template + inline 
miopenTensorDescriptor_t descriptor(const std::vector& dim, + const std::vector& stride) { + return descriptor(CudnnDataType::type, dim, stride); + } + + inline miopenTensorDescriptor_t desc() { return desc_; } + + private: + miopenTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedDropoutDescriptor { + public: + ScopedDropoutDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateDropoutDescriptor(&desc_)); + } + ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyDropoutDescriptor(desc_)); + } + + inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, + const phi::Place& place, + bool initialized, + float dropout_prob_, + phi::DenseTensor* dropout_state_, + int seed, + size_t state_size) { + if (dropout_state_ == nullptr) { // for no dropout or test + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSetDropoutDescriptor(desc_, + handle, + 0 /* dropout */, + nullptr, + 0 /* state_size */, + 0 /* seed */, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); + return desc_; + } + auto* dropout_state_data = dropout_state_->data(); + if (!initialized) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSetDropoutDescriptor(desc_, + handle, + dropout_prob_, + dropout_state_data, + state_size, + seed, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); + } else { + auto dropout_state_dims = dropout_state_->dims(); + state_size = dropout_state_dims[0]; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( + desc_, + handle, + dropout_prob_, + dropout_state_data, + state_size, + 0, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); + } + return desc_; + } + inline miopenDropoutDescriptor_t desc() { return desc_; } + + private: + miopenDropoutDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); +}; + +class ScopedRNNDescriptor { + public: + ScopedRNNDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); + } + ~ScopedRNNDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); + } + + inline miopenRNNDescriptor_t desc() { return desc_; } + + private: + miopenRNNDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(desc_)); + } + + inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, + const miopenDataType_t type, + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW(MCDHW), where M is the number of + // output image channels, C is the number of input image channels, + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
+ } + std::vector stride_dim(kernel_with_group.size()); + stride_dim.push_back(1); + for (int k = kernel_with_group.size() - 2; k >= 0; k--) { + stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + desc_, + type, + kernel_with_group.size(), + const_cast(kernel_with_group.data()), + const_cast(stride_dim.data()))); + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel, + const int groups = 1) { + return descriptor( + GetCudnnTensorFormat(order), CudnnDataType::type, kernel, groups); + } + + inline miopenTensorDescriptor_t desc() { return desc_; } + + private: + miopenTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); + } + + inline miopenConvolutionDescriptor_t descriptor( + miopenDataType_t type, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), + strides.size(), + phi::errors::InvalidArgument( + "The size of pads and strides should be equal. But " + "received size of pads is %d, size of strides is %d.", + pads.size(), + strides.size())); + PADDLE_ENFORCE_EQ( + pads.size(), + dilations.size(), + phi::errors::InvalidArgument( + "The size of pads and dilations should be equal. But received size " + "of pads is %d, size of dilations is %d.", + pads.size(), + dilations.size())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( + desc_, + pads.size(), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + miopenConvolution)); + return desc_; + } + + template + inline miopenConvolutionDescriptor_t descriptor( + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + miopenConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyPoolingDescriptor(desc_)); + } + + inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), + pads.size(), + phi::errors::InvalidArgument( + "The size of kernel and pads should be equal. But " + "received size of kernel is %d, size of pads is %d.", + kernel.size(), + pads.size())); + PADDLE_ENFORCE_EQ( + kernel.size(), + strides.size(), + phi::errors::InvalidArgument( + "The size of kernel and strides should be equal. 
But " + "received size of kernel is %d, size of strides is %d.", + kernel.size(), + strides.size())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( + desc_, + GetPoolingMode(mode), + kernel.size(), + const_cast(kernel.data()), + const_cast(pads.data()), + const_cast(strides.data()))); + return desc_; + } + + private: + miopenPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +class ScopedActivationDescriptor { + public: + ScopedActivationDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateActivationDescriptor(&desc_)); + } + ~ScopedActivationDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyActivationDescriptor(desc_)); + } + + template + inline miopenActivationDescriptor_t descriptor( + const std::string& act, double value_max = static_cast(0.)) { + double relu_ceiling = 0.0; + ActivationMode activation_mode = StringToActivationMode(act); + miopenActivationMode_t mode; + switch (activation_mode) { + case ActivationMode::kNone: + mode = miopenActivationPASTHRU; + break; + case ActivationMode::kRelu6: + relu_ceiling = 6.0; + mode = miopenActivationCLIPPEDRELU; + break; + case ActivationMode::kReluX: + relu_ceiling = value_max; + mode = miopenActivationCLIPPEDRELU; + break; + case ActivationMode::kRelu: + mode = miopenActivationRELU; + break; + case ActivationMode::kSigmoid: + mode = miopenActivationLOGISTIC; + break; + case ActivationMode::kTanh: + mode = miopenActivationTANH; + break; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "Unrecognized MIOPEN activation mode: %d.", + static_cast(activation_mode))); + } + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( + desc_, mode, relu_ceiling, 0.0, 0.0)); + return desc_; + } + + private: + miopenActivationDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); +}; + +class ScopedCTCLossDescriptor { + public: + ScopedCTCLossDescriptor() { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); + } + ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); + } + + template + inline miopenCTCLossDescriptor_t descriptor() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( + desc_, CudnnDataType::type, 0, false)); + return desc_; + } + + private: + miopenCTCLossDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); +}; + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_device_function.h b/paddle/phi/backends/gpu/musa/rocm_device_function.h new file mode 100644 index 0000000000000..6f5d684075f0f --- /dev/null +++ b/paddle/phi/backends/gpu/musa/rocm_device_function.h @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. 
+#define PADDLE_CUDA_FP16 +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down(val, delta, width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor(val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down( + static_cast(val), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { + return phi::dtype::bfloat16(__shfl_down( + static_cast(val), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( + unsigned mask, phi::dtype::float16 val, int width) { + return phi::dtype::float16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { + return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl(val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. 
+ // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. +#ifdef PADDLE_WITH_HIP + const int warpSize = 64; +#else + const int warpSize = 32; +#endif + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_helper.h b/paddle/phi/backends/gpu/musa/rocm_helper.h new file mode 100644 index 0000000000000..07fdde5a2f417 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/rocm_helper.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * + */ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_info.cc b/paddle/phi/backends/gpu/musa/rocm_info.cc new file mode 100644 index 0000000000000..32c7c329253b1 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/rocm_info.cc @@ -0,0 +1,334 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + size_t version_major, version_minor, version_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &version_major, &version_minor, &version_patch)); + return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + musaError_t status = musaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. 
No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int major, minor; + auto major_error_code = musaDeviceGetAttribute( + &major, musaDeviceAttributeComputeCapabilityMajor, id); + auto minor_error_code = musaDeviceGetAttribute( + &minor, musaDeviceAttributeComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = + musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + // TODO(qili93): Hygon DTK (21.04 and 22.04) not support + // musaDeviceAttributeManagedMemory, temporary disable by default, to be + // verified in next DTK release + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); +#ifdef __linux__ + return IsGPUManagedMemorySupported(dev_id) && + GetGPUComputeCapability(dev_id) >= 60; +#else + return false; +#endif +} + +} // namespace gpu +} // namespace backends +} // namespace phi From c34f22d0ee4b7774c6f343f723c910583d4bf88f Mon Sep 17 00:00:00 2001 From: Xiaokang Shang Date: Tue, 25 Jul 2023 17:13:23 +0800 Subject: [PATCH 04/55] remove paddle/phi/backends/gpu/musa files --- paddle/phi/backends/gpu/musa/miopen_desc.h | 264 -------- paddle/phi/backends/gpu/musa/miopen_helper.h | 595 ------------------ paddle/phi/backends/gpu/musa/musa_info.cc | 32 +- .../backends/gpu/musa/rocm_device_function.h | 165 ----- paddle/phi/backends/gpu/musa/rocm_helper.h | 74 --- paddle/phi/backends/gpu/musa/rocm_info.cc | 334 ---------- 6 files changed, 16 insertions(+), 1448 deletions(-) delete mode 100644 paddle/phi/backends/gpu/musa/miopen_desc.h delete mode 100644 paddle/phi/backends/gpu/musa/miopen_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/rocm_device_function.h delete mode 100644 paddle/phi/backends/gpu/musa/rocm_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/rocm_info.cc diff --git a/paddle/phi/backends/gpu/musa/miopen_desc.h b/paddle/phi/backends/gpu/musa/miopen_desc.h deleted file mode 100644 index ae0e274ca650e..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_desc.h +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#include "paddle/phi/core/utils/data_type.h" - -namespace phi { -namespace backends { -namespace gpu { - -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); - int H, W, D, C; - if (dims.size() == 4) { - H = dims[1]; - W = dims[2]; - C = dims[3]; - transformed_dims[1] = C; - transformed_dims[2] = H; - transformed_dims[3] = W; - } else { - D = dims[1]; - H = dims[2]; - W = dims[3]; - C = dims[4]; - transformed_dims[1] = C; - transformed_dims[2] = D; - transformed_dims[3] = H; - transformed_dims[4] = W; - } - return transformed_dims; -} - -inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { - miopenDataType_t type = miopenFloat; - switch (t) { - case phi::DataType::FLOAT16: - type = miopenHalf; - break; - case phi::DataType::FLOAT32: - type = miopenFloat; - break; - default: - break; - } - return type; -} - -class ActivationDescriptor { - public: - using T = miopenActivationDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(t)); - t = nullptr; - } - } - }; - ActivationDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - template - void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_.get(), mode, static_cast(coef), 0.0, 0.0)); - } - - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - private: - std::unique_ptr desc_; -}; - -class TensorDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - TensorDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - void 
set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { - const int groups = 1; - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class FilterDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - FilterDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, - const miopenTensorFormat_t format, - const int groups = 1) { - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class ConvolutionDescriptor { - public: - using T = miopenConvolutionDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(t)); - t = nullptr; - } - } - }; - ConvolutionDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(miopenDataType_t dtype, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations, - bool allow_tf32, - const int groups = 1) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - (miopenConvolutionDescriptor_t)desc_.get(), - static_cast(pads.size()), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( - (miopenConvolutionDescriptor_t)desc_.get(), groups)); - } - - private: - std::unique_ptr desc_; -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/miopen_helper.h b/paddle/phi/backends/gpu/musa/miopen_helper.h deleted file mode 100644 index 
095f32ba460d0..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_helper.h +++ /dev/null @@ -1,595 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "gflags/gflags.h" - -#include "paddle/phi/backends/dynload/miopen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" - -// MIOPEN do not have epslion definition -#define CUDNN_BN_MIN_EPSILON 1e-05 - -DECLARE_bool(cudnn_deterministic); - -namespace phi { -namespace backends { -namespace gpu { - -inline const char* miopenGetErrorString(miopenStatus_t status) { - switch (status) { - case miopenStatusSuccess: - return "miopenStatusSuccess"; - case miopenStatusNotInitialized: - return "miopenStatusNotInitialized"; - case miopenStatusAllocFailed: - return "miopenStatusAllocFailed"; - case miopenStatusBadParm: - return "miopenStatusBadParm"; - case miopenStatusInternalError: - return "miopenStatusInternalError"; - case miopenStatusInvalidValue: - return "miopenStatusInvalidValue"; - case miopenStatusUnknownError: - return "miopenStatusUnknownError"; - case miopenStatusNotImplemented: - return "miopenStatusNotImplemented"; - default: - return "Unknown miopen error number"; - } -} - -// no use, but will have compiling error if not defined -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) - -enum class DataLayout { // Not use - kNHWC, - kNCHW, - kNCDHW, - kNDHWC, // add, liyamei - kNCHW_VECT_C, -}; - -enum class PoolingMode { - kMaximum, - kMaximumDeterministic, - kAverageExclusive, - kAverageInclusive, -}; - -enum class ActivationMode { - kNone, // activation identity - kSigmoid, - kRelu, - kRelu6, - kReluX, - kTanh, - kBandPass, -}; - -inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return miopenPoolingMax; - case PoolingMode::kAverageExclusive: - return miopenPoolingAverage; - case PoolingMode::kAverageInclusive: - return miopenPoolingAverageInclusive; - case PoolingMode::kMaximum: - return miopenPoolingMax; - default: - PADDLE_THROW( - phi::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); - } -} - -inline ActivationMode StringToActivationMode(const std::string& str) { - if (str == "identity") { - return ActivationMode::kNone; - } else if (str == "sigmoid") { - return ActivationMode::kSigmoid; - } else if (str == "relu") { - return ActivationMode::kRelu; - } else if (str == "relu6") { - return ActivationMode::kRelu6; - } else if (str == "relux") { - return ActivationMode::kReluX; - } else if (str == "tanh") { - return ActivationMode::kTanh; - } else if (str == "bandpass") { - return ActivationMode::kBandPass; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unknown 
MIOPEN activation string: %s.", str)); - } -} - -template -class CudnnDataType; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenHalf; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenBFloat16; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenFloat; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { - switch (order) { - case DataLayout::kNHWC: - return MIOPEN_TENSOR_NHWC; - case DataLayout::kNCHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNCDHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNDHWC: - return MIOPEN_TENSOR_NHWC; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "MIOPEN has no equivalent dataLayout for input order.")); - } - return MIOPEN_TENSOR_NCHW; -} - -class ScopedTensorDescriptor { - public: - ScopedTensorDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& dims, - const int groups = 1) { - // the format is not used now, will add later - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - // Update tensor descriptor dims setting if groups > 1 - // NOTE: Here, Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - - // MIOPEN ONLY support data layout of NCHW - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - if (dims.size() == 4) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } else if (dims.size() == 5) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); - } - - inline miopenTensorDescriptor_t 
descriptor(const miopenDataType_t miopen_type, - const std::vector& dim, - const std::vector& stride) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - miopen_type, - dim.size(), - const_cast(dim.data()), - const_cast(stride.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const std::vector& dim, - const std::vector& stride) { - return descriptor(CudnnDataType::type, dim, stride); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); -}; - -class ScopedDropoutDescriptor { - public: - ScopedDropoutDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateDropoutDescriptor(&desc_)); - } - ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyDropoutDescriptor(desc_)); - } - - inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, - const phi::Place& place, - bool initialized, - float dropout_prob_, - phi::DenseTensor* dropout_state_, - int seed, - size_t state_size) { - if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - 0 /* dropout */, - nullptr, - 0 /* state_size */, - 0 /* seed */, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - return desc_; - } - auto* dropout_state_data = dropout_state_->data(); - if (!initialized) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - seed, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } else { - auto dropout_state_dims = dropout_state_->dims(); - state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( - desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - 0, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } - return desc_; - } - inline miopenDropoutDescriptor_t desc() { return desc_; } - - private: - miopenDropoutDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); -}; - -class ScopedRNNDescriptor { - public: - ScopedRNNDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); - } - ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); - } - - inline miopenRNNDescriptor_t desc() { return desc_; } - - private: - miopenRNNDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); -}; - -class ScopedFilterDescriptor { - public: - ScopedFilterDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& kernel, - const int groups = 1) { - // filter layout: MCHW(MCDHW), where M is the number of - // output image channels, C is the number of input image channels, - // D is the depth of the filter, H is the height of the filter, and W is the - // width of the filter. - std::vector kernel_with_group(kernel.begin(), kernel.end()); - if (groups > 1) { - kernel_with_group[0] /= groups; - // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
- } - std::vector stride_dim(kernel_with_group.size()); - stride_dim.push_back(1); - for (int k = kernel_with_group.size() - 2; k >= 0; k--) { - stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - kernel_with_group.size(), - const_cast(kernel_with_group.data()), - const_cast(stride_dim.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& kernel, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, kernel, groups); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); -}; - -class ScopedConvolutionDescriptor { - public: - ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); - } - ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); - } - - inline miopenConvolutionDescriptor_t descriptor( - miopenDataType_t type, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - PADDLE_ENFORCE_EQ(pads.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of pads and strides should be equal. But " - "received size of pads is %d, size of strides is %d.", - pads.size(), - strides.size())); - PADDLE_ENFORCE_EQ( - pads.size(), - dilations.size(), - phi::errors::InvalidArgument( - "The size of pads and dilations should be equal. But received size " - "of pads is %d, size of dilations is %d.", - pads.size(), - dilations.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - desc_, - pads.size(), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - return desc_; - } - - template - inline miopenConvolutionDescriptor_t descriptor( - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - return descriptor(CudnnDataType::type, pads, strides, dilations); - } - - private: - miopenConvolutionDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); -}; - -class ScopedPoolingDescriptor { - public: - ScopedPoolingDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreatePoolingDescriptor(&desc_)); - } - ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyPoolingDescriptor(desc_)); - } - - inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, - const std::vector& kernel, - const std::vector& pads, - const std::vector& strides) { - PADDLE_ENFORCE_EQ(kernel.size(), - pads.size(), - phi::errors::InvalidArgument( - "The size of kernel and pads should be equal. But " - "received size of kernel is %d, size of pads is %d.", - kernel.size(), - pads.size())); - PADDLE_ENFORCE_EQ( - kernel.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of kernel and strides should be equal. 
But " - "received size of kernel is %d, size of strides is %d.", - kernel.size(), - strides.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( - desc_, - GetPoolingMode(mode), - kernel.size(), - const_cast(kernel.data()), - const_cast(pads.data()), - const_cast(strides.data()))); - return desc_; - } - - private: - miopenPoolingDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); -}; - -class ScopedActivationDescriptor { - public: - ScopedActivationDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&desc_)); - } - ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(desc_)); - } - - template - inline miopenActivationDescriptor_t descriptor( - const std::string& act, double value_max = static_cast(0.)) { - double relu_ceiling = 0.0; - ActivationMode activation_mode = StringToActivationMode(act); - miopenActivationMode_t mode; - switch (activation_mode) { - case ActivationMode::kNone: - mode = miopenActivationPASTHRU; - break; - case ActivationMode::kRelu6: - relu_ceiling = 6.0; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kReluX: - relu_ceiling = value_max; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kRelu: - mode = miopenActivationRELU; - break; - case ActivationMode::kSigmoid: - mode = miopenActivationLOGISTIC; - break; - case ActivationMode::kTanh: - mode = miopenActivationTANH; - break; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "Unrecognized MIOPEN activation mode: %d.", - static_cast(activation_mode))); - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_, mode, relu_ceiling, 0.0, 0.0)); - return desc_; - } - - private: - miopenActivationDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); -}; - -class ScopedCTCLossDescriptor { - public: - ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); - } - ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); - } - - template - inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( - desc_, CudnnDataType::type, 0, false)); - return desc_; - } - - private: - miopenCTCLossDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc index 6579ce63f21f6..f2087e4d7f4fc 100644 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -88,16 +88,15 @@ int GetGPUComputeCapability(int id) { "but received id is: %d. 
GPU count is: %d.", id, GetGPUDeviceCount())); - return 100; - //int major, minor; - //auto major_error_code = musaDeviceGetAttribute( - // &major, musaDeviceAttributeComputeCapabilityMajor, id); - //auto minor_error_code = musaDeviceGetAttribute( - // &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - //PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - //PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - //return major * 100 + minor; + int major, minor; + auto major_error_code = musaDeviceGetAttribute( + &major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = musaDeviceGetAttribute( + &minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; } int GetGPURuntimeVersion(int id) { @@ -138,7 +137,8 @@ int GetGPUMultiProcessors(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); return count; } @@ -152,7 +152,7 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); return count; } @@ -167,7 +167,7 @@ int GetGPUMaxThreadsPerBlock(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); return count; } @@ -188,17 +188,17 @@ std::array GetGpuMaxGridDimSize(int id) { std::array ret; int size; auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); ret[0] = size; auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); ret[1] = size; auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); ret[2] = size; return ret; diff --git a/paddle/phi/backends/gpu/musa/rocm_device_function.h b/paddle/phi/backends/gpu/musa/rocm_device_function.h deleted file mode 100644 index 6f5d684075f0f..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_device_function.h +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. 
-#define PADDLE_CUDA_FP16 -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" - -namespace phi { -namespace backends { -namespace gpu { - -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down(val, delta, width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor(val, width); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { - return phi::dtype::float16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { - return phi::dtype::bfloat16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { - return phi::dtype::float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { - return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl(val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. 
- // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else - const int warpSize = 32; -#endif - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_helper.h b/paddle/phi/backends/gpu/musa/rocm_helper.h deleted file mode 100644 index 07fdde5a2f417..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace phi { -namespace backends { -namespace gpu { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why need this macro? ] - * - * The original looping in CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB, the first iteration is no problem here, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * will greater than INT_MAX and overflow becomes negative value, at - * this time, the cycle condition `i < (n)` is still satisfied, so it - * will cause illegal access to cuda memory. - * - * Here is a real example in ERINE, it will trigger above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follow, the int64_t __index__ will - * prevent overflow in the loop increment. 
- * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ - int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_info.cc b/paddle/phi/backends/gpu/musa/rocm_info.cc deleted file mode 100644 index 32c7c329253b1..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_info.cc +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "paddle/phi/backends/gpu/gpu_info.h" - -#include "paddle/phi/core/enforce.h" - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -namespace phi { -namespace backends { -namespace gpu { - -int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -} - -static int GetGPUDeviceCountImpl() { - int driverVersion = 0; - musaError_t status = musaDriverGetVersion(&driverVersion); - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - - const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); - - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " - "empty. 
No GPU detected."; - return 0; - } - } - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); - return count; -} - -int GetGPUDeviceCount() { - // cache the count - static auto dev_cnt = GetGPUDeviceCountImpl(); - return dev_cnt; -} - -int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int major, minor; - auto major_error_code = musaDeviceGetAttribute( - &major, musaDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = musaDeviceGetAttribute( - &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - return major * 100 + minor; -} - -int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int runtime_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); - return runtime_version; -} - -int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int driver_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); - return driver_version; -} - -bool TensorCoreAvailable() { return false; } - -int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); - return count; -} - -int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); - - return count; -} - -int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); - return count; -} - -int GetCurrentDeviceId() { - int device_id; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); - return device_id; -} - -std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - std::array ret; - int size; - auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret[0] = size; - - auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret[1] = size; - - auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret[2] = size; - return ret; -} - -std::pair GetGpuStreamPriorityRange() { - int least_priority, greatest_priority; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); - return std::make_pair(least_priority, greatest_priority); -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = GetGPUDeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(phi::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, - static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); -} - -void GpuMemcpyAsync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); -} - -void GpuMemcpySync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); -} - -void GpuMemcpyPeerAsync(void *dst, - int dst_device, - const void *src, - int src_device, - size_t count, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -} - -void GpuMemcpyPeerSync( - void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeer(dst, dst_device, src, src_device, count)); -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); -} - -void GpuStreamSync(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); -} - -void GpuDestroyStream(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); -} - -void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } - -gpuError_t GpuGetLastError() { return musaGetLastError(); } - -bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - // TODO(qili93): Hygon DTK (21.04 and 22.04) not support - // musaDeviceAttributeManagedMemory, temporary disable by default, to be - // verified in next DTK release - return false; -} - -bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); -#ifdef __linux__ - return IsGPUManagedMemorySupported(dev_id) && - GetGPUComputeCapability(dev_id) >= 60; -#else - return false; -#endif -} - -} // namespace gpu -} // namespace backends -} // namespace phi From 342b7385f381af2addab2ec541f52af9b7b7437b Mon Sep 17 00:00:00 2001 From: "yiyuan.zhou" Date: Tue, 25 Jul 2023 18:27:19 +0800 Subject: [PATCH 05/55] add musa_stream --- paddle/fluid/pybind/cuda_streams_py.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index da6dee7657c09..de97b39218157 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -84,6 +84,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif From 67d65fd1d8819d3f18c493a3a0e3d425aeecf583 Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Mon, 24 Jul 2023 12:12:15 +0000 Subject: [PATCH 06/55] [MTAI] build(system): enable build system in paddle for MUSA --- .../operators/fused/fused_seqpool_cvm_op.cu | 63 ++++++++++++++++++- .../operators/fused/multihead_matmul_op.cu | 4 +- .../fluid/operators/fused/yolo_box_post_op.cu | 61 ++++++++++++++---- .../operators/math/bert_encoder_functor.h | 6 ++ paddle/fluid/operators/math/sample_prob.cu | 7 ++- .../optimizers/distributed_fused_lamb_op.cu | 29 +++++++-- .../fluid/operators/reader/buffered_reader.cc | 7 ++- paddle/fluid/platform/collective_helper.cc | 2 +- paddle/fluid/platform/device/gpu/gpu_helper.h | 4 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 8 +-- .../platform/device/gpu/gpu_resource_pool.cc | 10 +++ .../platform/device/gpu/gpu_resource_pool.h | 5 ++ paddle/fluid/platform/device/gpu/gpu_types.h | 4 +- paddle/fluid/platform/enforce.h | 10 +++ paddle/fluid/platform/event.h | 3 + paddle/fluid/platform/profiler.cu | 20 +++++- paddle/fluid/platform/profiler/profiler.cc | 6 ++ paddle/fluid/platform/profiler/utils.cc | 2 + paddle/fluid/platform/profiler/utils.h | 2 + paddle/fluid/platform/profiler_helper.h | 12 ++++ .../fluid/platform/stream_callback_manager.h | 5 ++ paddle/fluid/pybind/cuda_streams_py.cc | 2 + paddle/fluid/pybind/tensor_py.h | 3 + paddle/phi/core/cuda_stream.h | 5 -- 24 files changed, 246 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 362860aa23bdf..f038190e72927 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -122,7 +122,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext memory::AllocShared(ctx.GetPlace(), total_ptr_len * sizeof(void *)); void *ptr = temp_ptr->ptr(); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, input_data.data(), @@ -150,6 +150,34 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_input_values, + input_data.data(), + input_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_output_values 
= + reinterpret_cast(&gpu_input_values[input_data.size()]); + platform::GpuMemcpyAsync(gpu_output_values, + output_data.data(), + output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_seqpool_output_values = + reinterpret_cast(&gpu_output_values[output_data.size()]); + platform::GpuMemcpyAsync(gpu_seqpool_output_values, + seqpool_output_data.data(), + seqpool_output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + size_t **lods_values = reinterpret_cast( + &gpu_seqpool_output_values[seqpool_output_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -325,7 +353,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, cvm_data.size() + lods.size(); auto temp_ptr = memory::AllocShared(ctx.GetPlace(), total_ptr_len * sizeof(void *)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, out_grads_data.data(), @@ -356,6 +384,37 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_out_grads_values, + out_grads_data.data(), + out_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_in_grads_values = + reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_in_grads_values, + in_grads_data.data(), + in_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_cvm_values = + reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_cvm_values, + cvm_data.data(), + cvm_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + size_t **lods_values = + reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 8402bc78ef64c..36d0de8c6c9d1 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -327,8 +327,10 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { temp_bias_tensor.Resize({size}); auto *temp_qk_bias = device_ctx.template Alloc( &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemset(temp_qk_bias, 0, sizeof(float) * size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(temp_qk_bias, 0, sizeof(float) * size); #else cudaMemset(temp_qk_bias, 0, sizeof(float) * size); #endif diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 72bb97a2aae9e..6b8874d289c77 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -252,9 +252,12 @@ static void YoloTensorParseCuda( // Estimate how many boxes will be choosed int bbox_count = 0; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) 
hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -265,9 +268,12 @@ static void YoloTensorParseCuda( class_num, anchors_num, prob_thresh); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -280,9 +286,12 @@ static void YoloTensorParseCuda( float* bbox_tensor = *bboxes_tensor_ptr; // Update previous maximum bbox number if (bbox_count > *bbox_count_max_alloc) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_tensor); + musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -293,9 +302,12 @@ static void YoloTensorParseCuda( // Now generate bboxes int bbox_index = 0; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -349,13 +361,20 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.insert(anchors.end(), anchors1.begin(), anchors1.end()); anchors.insert(anchors.end(), anchors2.begin(), anchors2.end()); int* device_anchors; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); hipMemcpy(device_anchors, anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + musaMemcpy(device_anchors, + anchors.data(), + anchors.size() * sizeof(int), + musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -384,10 +403,14 @@ class YoloBoxPostKernel : public framework::OpKernel { int batch = context.Input("ImageShape")->dims()[0]; TensorInfo* ts_info = new TensorInfo[batch * boxes_input.size()]; for (int i = 0; i < batch * static_cast(boxes_input.size()); i++) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -395,9 +418,12 @@ class YoloBoxPostKernel : public framework::OpKernel { #endif ts_info[i].bboxes_host_ptr = reinterpret_cast(malloc( ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float))); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + 
musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); @@ -407,8 +433,10 @@ class YoloBoxPostKernel : public framework::OpKernel { // Box index counter in gpu memory // *bbox_index_device_ptr used by atomicAdd int* bbox_index_device_ptr; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -450,12 +478,18 @@ class YoloBoxPostKernel : public framework::OpKernel { bbox_count_max_alloc * (5 + class_num) * sizeof(float))); } // we need copy bbox_count_host boxes to cpu memory -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, + ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -532,15 +566,20 @@ class YoloBoxPostKernel : public framework::OpKernel { boxes_num_data[batch_id] = bbox_det_vec.size(); } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipFree(bbox_index_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif for (int i = 0; i < batch * boxes_input.size(); i++) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(ts_info[i].bboxes_dev_ptr); + musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index a9869e5faecce..e5adc97fa7890 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -18,6 +18,12 @@ limitations under the License. 
*/
 #include 
 #include 
+#include   // NOLINT
+#endif
+#ifdef PADDLE_WITH_MUSA
+#include 
+#include 
+
 #include   // NOLINT
 #endif
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu
index 0c6b49729546c..4aa38e7441917 100644
--- a/paddle/fluid/operators/math/sample_prob.cu
+++ b/paddle/fluid/operators/math/sample_prob.cu
@@ -155,11 +155,16 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context,
   int num_tries = UniqSampler(sampler, num_samples, s_data);
   VLOG(1) << "num_tries: " << num_tries;
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(samples_data + num_true,
                                        s_data,
                                        sizeof(int64_t) * num_samples,
                                        hipMemcpyHostToDevice));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true,
+                                        s_data,
+                                        sizeof(int64_t) * num_samples,
+                                        musaMemcpyHostToDevice));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true,
                                         s_data,
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index cad7e38ba1c1a..ba520f026bf7a 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -32,6 +32,11 @@
 #include "cub/cub.cuh"
 #include "math.h"  // NOLINT
 #endif
+#ifdef __MUSACC__
+#include "cub/cub.cuh"
+#include "math.h"  // NOLINT
+#endif
+
 #ifdef __HIPCC__
 #include 
@@ -51,8 +56,10 @@ using phi::funcs::ToVector;
 template 
 static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) {
   static_assert(!std::is_same::value, "T cannot be void.");
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream));
 #endif
@@ -250,10 +257,14 @@ static void LogParamAndTrustRatioDivSquareNorm(
 static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) {
   auto stream = dev_ctx.stream();
   float cpu_value;
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
       &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(
+      &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
       &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream));
@@ -1129,10 +1140,14 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) {
                           stream,
                           &cub_buffer);
   T ret_cpu[2];
-#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
       &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
+#elif defined(PADDLE_WITH_MUSA)
+  PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(
+      &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
       &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream));
@@ -1183,12 +1198,18 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) {
                   dev_ctx.stream(),
&buffer); bool flag; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(&flag, out.Get(), sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, + out.Get(), + sizeof(flag), + musaMemcpyDeviceToHost, + dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 73b3823d3e5ab..8255acecb3707 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -192,11 +192,16 @@ void BufferedReader::ReadAsync(size_t i) { // cuda[i].mutable_data() is called, since some ops release // cuda memory immediately without waiting cuda kernel ends platform::SetDeviceId(place_.device); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index a6c2b9d61dd2b..941cd49cd361d 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -172,7 +172,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( { PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 7fde4429bb7f3..ac096b94bed84 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -15,8 +15,10 @@ #pragma once #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/fluid/platform/device/gpu/musa/musa_helper.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 73fe0ca05ba73..2959773e14737 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/dynload/miopen.h" #elif defined(PADDLE_WITH_MUSA) //TODO(Xiaokang Shang) @@ -212,7 +212,7 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); gpuError_t result; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) if (UNLIKELY(malloc_managed_memory)) { result = hipMallocManaged(ptr, size); } else { @@ -267,7 +267,7 @@ class RecordedGpuMallocHelper { // process is terminating, in which case we don't care if // cudaFree succeeds. CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { #elif defined(PADDLE_WITH_MUSA) @@ -318,7 +318,7 @@ class RecordedGpuMallocHelper { size_t *actual_total) { { CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) auto result = hipMemGetInfo(actual_avail, actual_total); #elif defined(PADDLE_WITH_MUSA) auto result = musaMemGetInfo(actual_avail, actual_total); diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index ee60040f09074..d8e9197bf6ea5 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -30,6 +30,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -41,6 +44,8 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -82,6 +87,9 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -93,6 +101,8 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index ff1452153e7bd..8de12bba141c6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -21,6 +21,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index b3d4c7071c216..dac2add9f82c1 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -101,7 +101,7 @@ using CUDAGraphID = 
unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; #elif defined(PADDLE_WITH_MUSA) @@ -116,7 +116,7 @@ using CUDAGraphID = unsigned long long; // NOLINT DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, hipErrorOutOfMemory, - musaErrorOutOfMemory); + musaErrorMemoryAllocation); DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 105c5f0607f69..160d6fb9912cb 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,6 +38,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e807a54fdee2d..e1a40cb8f7f64 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,6 +21,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 5d1caffd45326..1f9bacecfea4b 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -38,7 +42,7 @@ static void ForEachDevice(std::function func) { } void DummyKernelAndEvent() { -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { platform::SetDeviceId(d); @@ -52,6 +56,20 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } +#elif defined(PADDLE_WITH_MUSA) + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + platform::SetDeviceId(d); + musaStream_t stream; + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); + Mark("_cuda_startup_"); + int *ptr; + PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); + }); + } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 8f34d5acc0bee..ca3211ba103aa 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,6 +18,9 @@ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif @@ -43,6 +46,9 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); +#endif #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 7fb25b25577c4..a4fb29b86f43f 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ 
b/paddle/fluid/platform/profiler/utils.cc @@ -93,6 +93,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, return occupancy; } +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t DeviceId, diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index c9437e0e7793a..5adaadf87d288 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -133,6 +133,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, int32_t BlockZ, void* kernelFunc, uint8_t launchType); +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 5dad7788d0b09..2fa0ece0f9883 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -31,6 +31,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA #ifdef PADDLE_WITH_HIP #include #endif @@ -103,6 +106,15 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif +#ifdef PADDLE_WITH_MUSA + int pre_device_id = GetCurrentDeviceId(); + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + } + SetDeviceId(pre_device_id); +#endif #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 7cd6930a9d0d0..10b0a1aded0d9 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -21,6 +21,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index da6dee7657c09..de97b39218157 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -84,6 +84,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 8b4f4dcd62de1..b7375243d8db9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -466,6 +466,9 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + paddle::platform::GpuMemcpySync( + dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 87ab5e23818fb..26ec22f103a90 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,11 +28,6 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif -#ifdef PADDLE_WITH_CUDA -#include -using gpuStream_t = cudaStream_t; -#endif - #ifdef PADDLE_WITH_MUSA #include using gpuStream_t = musaStream_t; From 
a1a54eeb607ddf12ebc04d8d9d138507c00a274f Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Mon, 24 Jul 2023 12:12:15 +0000 Subject: [PATCH 07/55] [MTAI] build(system): enable build system in paddle for MUSA --- paddle/fluid/framework/conv_search_cache.h | 4 ++-- .../fluid/inference/api/analysis_predictor.cc | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 2 +- .../fluid/memory/allocation/cuda_allocator.cc | 1 - .../cuda_device_context_allocator.h | 6 ++--- .../allocation/naive_best_fit_allocator.cc | 2 +- .../memory/allocation/pinned_allocator.cc | 4 ++-- .../memory/allocation/system_allocator.cc | 2 +- paddle/fluid/memory/memcpy.cc | 22 +++++++++---------- .../fluid/operators/class_center_sample_op.cu | 4 ++-- 10 files changed, 24 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index cbac8cac4e543..aca4ce5f23d8c 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -32,7 +32,7 @@ class ConvSearchCache { static ConvSearchCache instance; return instance; } -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP AlgorithmsCache* GetForward() { return &forward_cache_; } @@ -69,7 +69,7 @@ class ConvSearchCache { ConvSearchCache(const ConvSearchCache&) {} ConvSearchCache& operator=(const ConvSearchCache&) {} -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 12e893d72781f..12af725b6e407 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2204,7 +2204,7 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); #elif defined(PADDLE_WITH_MUSA) musaStreamSynchronize(stream); diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index ed2993e7a39e7..3c8f0694ee774 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -215,7 +215,7 @@ void QkvToContextPluginDynamic::configurePlugin( fake_qk_bias_ = reinterpret_cast( tensor_.mutable_data(platform::CUDAPlace(device_id))); int64_t size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_; -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #elif defined(PADDLE_WITH_MUSA) diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index da5fdc829e8c0..51e6c88d55d50 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -24,7 +24,6 @@ #include #endif -#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 1401aeb7a11be..42e6f7be8de31 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ 
b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -79,7 +79,7 @@ class GPUContextAllocator : public Allocator { gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #elif defined(PADDLE_WITH_MUSA) @@ -94,7 +94,7 @@ class GPUContextAllocator : public Allocator { ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); #elif defined(PADDLE_WITH_MUSA) PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); @@ -114,7 +114,7 @@ class GPUContextAllocator : public Allocator { auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #elif defined(PADDLE_WITH_MUSA) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index a7af040f86c5f..93ebf7a1af16b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -313,7 +313,7 @@ void *Alloc(const platform::CUDAPlace &place, string::HumanReadableSize(Used(place)))); } else { if (FLAGS_init_allocated_mem) { -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hipMemset(ptr, 0xEF, size); #elif defined(PADDLE_WITH_MUSA) musaMemset(ptr, 0xEF, size); diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 33c6ca55880cd..4737e5c565b45 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -21,7 +21,7 @@ namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(musaHostFree(allocation->ptr())); @@ -37,7 +37,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(musaHostMalloc(&ptr, size, musaHostMallocPortable)); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 4234b615c823b..d67df333cfaba 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -214,7 +214,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. 
-#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); #elif defined(PADDLE_WITH_MUSA) musaError_t result = musaHostMalloc(&p, size, musaHostMallocPortable); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 45b2ec3ca3875..b87cff7a7a429 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -259,7 +259,7 @@ void Copy(phi::Place dst_place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP inline void SyncCUDAStream() { #if !defined(_WIN32) hipStreamSynchronize(0); @@ -319,7 +319,7 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -341,7 +341,7 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); @@ -371,7 +371,7 @@ void Copy( if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -393,7 +393,7 @@ void Copy( } else { platform::RecordEvent record_event( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); @@ -425,7 +425,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -448,7 +448,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); @@ -532,7 +532,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -555,7 +555,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); @@ -582,7 +582,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, @@ -605,7 +605,7 @@ void Copy( platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, 1); -#if 
defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #elif defined(PADDLE_WITH_MUSA) platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 5327be6909b4f..2c4b4f1ceacf6 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP #include #include @@ -72,7 +72,7 @@ __global__ void RandomSampleClassCenter(const int64_t n, size_t local_seed = (static_cast(seed) + 0x9E3779B9U + (static_cast(id) << 6U) + (static_cast(id) >> 2U)); -#if defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP hiprand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); From 7e92cf78a4775e7aeb0a6d13e3b1f1c9150ed9c0 Mon Sep 17 00:00:00 2001 From: Xiaokang Shang Date: Wed, 26 Jul 2023 10:11:12 +0000 Subject: [PATCH 08/55] change kernels --- paddle/phi/kernels/autotune/gpu_timer.h | 16 ++++++ paddle/phi/kernels/batch_norm_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_kernel.cc | 2 +- paddle/phi/kernels/funcs/blas/blas.h | 9 ++-- paddle/phi/kernels/funcs/blas/blas_impl.h | 4 +- paddle/phi/kernels/funcs/dropout_impl.cu.h | 19 +++++++ paddle/phi/kernels/funcs/embedding_grad.h | 6 +-- paddle/phi/kernels/funcs/fft.cu | 7 ++- paddle/phi/kernels/funcs/fft_cache.h | 2 + paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 4 +- paddle/phi/kernels/funcs/math_cuda_utils.h | 3 ++ paddle/phi/kernels/funcs/select_impl.cu.h | 3 ++ paddle/phi/kernels/funcs/softmax.cu | 4 ++ paddle/phi/kernels/funcs/sparse/sparse_blas.h | 3 ++ .../fusion/gpu/fused_dropout_add_kernel.cu | 4 ++ .../fusion/gpu/fused_softmax_mask_utils.h | 6 ++- .../phi/kernels/gpu/activation_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/activation_kernel.cu | 2 +- paddle/phi/kernels/gpu/allclose_kernel.cu | 2 + .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 29 +++++++++-- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 22 ++++++-- .../kernels/gpu/cross_entropy_grad_kernel.cu | 2 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 12 +++++ .../phi/kernels/gpu/cudnn_lstm_grad_kernel.cu | 10 +++- paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu | 51 ++++++++++++++++++- paddle/phi/kernels/gpu/cudnn_lstm_utils.h | 3 ++ paddle/phi/kernels/gpu/cum_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/dirichlet_kernel.cu | 7 +++ .../phi/kernels/gpu/embedding_grad_kernel.cu | 3 ++ .../phi/kernels/gpu/graph_reindex_kernel.cu | 22 ++++++-- .../gpu/graph_sample_neighbors_kernel.cu | 17 +++++++ .../kernels/gpu/graph_send_ue_recv_funcs.h | 9 ++++ paddle/phi/kernels/gpu/group_norm_kernel.cu | 3 ++ .../kernels/gpu/instance_norm_grad_kernel.cu | 29 ++++++++++- .../phi/kernels/gpu/instance_norm_kernel.cu | 30 ++++++++++- .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 4 +- .../kernels/gpu/log_softmax_grad_kernel.cu | 7 +++ paddle/phi/kernels/gpu/log_softmax_kernel.cu | 7 +++ .../kernels/gpu/logcumsumexp_grad_kernel.cu | 2 +- .../phi/kernels/gpu/logsumexp_function.cu.h | 40 ++++++++++++++- .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/nll_loss_kernel.cu | 2 
+ paddle/phi/kernels/gpu/rnn_functor.h | 36 +++++++++++++ paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc | 16 ++++++ paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 22 +++++++- .../kernels/gpu/send_u_recv_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/send_u_recv_kernel.cu | 4 ++ .../kernels/gpu/send_ue_recv_grad_kernel.cu | 23 +++++++++ paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 6 ++- paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 13 +++++ paddle/phi/kernels/gpu/top_k_kernel.cu | 4 +- .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 40 +++++++++------ paddle/phi/kernels/gpudnn/conv_kernel.cu | 15 ++++-- .../gpudnn/conv_transpose_grad_kernel.cu | 51 +++++++++++-------- .../kernels/gpudnn/conv_transpose_kernel.cu | 19 +++++-- paddle/phi/kernels/gpudnn/pool_grad_kernel.cu | 25 +++++++-- paddle/phi/kernels/gpudnn/pool_kernel.cu | 26 ++++++++-- .../phi/kernels/gpudnn/softmax_grad_kernel.cu | 2 +- paddle/phi/kernels/impl/conv_cudnn_impl.h | 2 + paddle/phi/kernels/impl/isclose_kernel_impl.h | 2 + .../kernels/impl/segment_pool_kernel_impl.h | 5 ++ .../kernels/primitive/datamover_primitives.h | 4 ++ paddle/phi/kernels/reduce_min_kernel.cc | 5 ++ .../kernels/sparse/batch_norm_grad_kernel.cc | 2 +- .../phi/kernels/sparse/batch_norm_kernel.cc | 2 +- .../phi/kernels/sparse/gpu/coalesce_kernel.cu | 4 ++ paddle/phi/kernels/sparse/gpu/conv.cu.h | 6 +++ .../phi/kernels/sparse/gpu/convolution.cu.h | 29 +++++++++++ .../kernels/sparse/gpu/elementwise_kernel.cu | 2 + .../kernels/sparse/gpu/matmul_grad_kernel.cu | 10 +++- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 2 + paddle/phi/kernels/sparse/gpu/slice_kernel.cu | 12 +++++ .../kernels/sparse/gpu/softmax_grad_kernel.cu | 3 ++ .../kernels/sparse/gpu/sparse_utils_kernel.cu | 14 +++-- paddle/phi/kernels/strings/gpu/copy_utils.h | 6 +++ paddle/phi/kernels/strings/unicode.cc | 8 +++ 80 files changed, 740 insertions(+), 105 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 87eca2613a7b5..3817e62791c47 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -23,6 +23,9 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif namespace phi { @@ -32,6 +35,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreate(&start_); + musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -46,6 +52,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(start_); + musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -55,6 +64,8 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -63,6 +74,8 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -73,6 +86,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventSynchronize(stop_); + musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); 
cudaEventElapsedTime(&milliseconds, start_, stop_); diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index bf04c99dab0a3..dba08b0de366a 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 8dcd3a1d995d8..58cacd21bba18 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(coalesce_tensor, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 65ee3c1851003..81ed7170d7a24 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index dbab3bd326664..47ab1a7839066 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 2ea7a306f16fd..3b758882e4072 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,7 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +303,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -445,7 +445,7 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(ARGS... 
args) const { Base()->template MatMulWithHead(args...); @@ -593,3 +593,6 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif +#ifdef PADDLE_WITH_MUSA +// TODO +#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index ffafe15b8fcf2..5e4c058ee589b 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1452,7 +1452,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, @@ -1698,7 +1698,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index a1fc2c225ecf2..b4387e594d577 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -24,6 +24,10 @@ limitations under the License. */ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/kernels/funcs/dropout_impl_util.h" @@ -142,6 +146,10 @@ __global__ void VectorizedRandomGenerator(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -212,6 +220,10 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -295,6 +307,11 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -430,6 +447,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/embedding_grad.h 
b/paddle/phi/kernels/funcs/embedding_grad.h index 3ad0f22c8e912..8d43553325277 100644 --- a/paddle/phi/kernels/funcs/embedding_grad.h +++ b/paddle/phi/kernels/funcs/embedding_grad.h @@ -96,7 +96,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table, unsigned long long int matchmask = // NOLINT __ballot(match_found_this_thread); // NOLINT int first_remaining_peer = __ffsll(matchmask) - 1; -#else +#else // MUSA and CUDA // If and only if match_found_this_thread of the Nth thread is non-zero, // set the Nth bit of matchmask to 1. unsigned int matchmask = @@ -112,7 +112,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table, while (matchmask) { #ifdef PADDLE_WITH_HIP first_remaining_peer = __ffsll(matchmask) - 1; -#else +#else // CUDA and MUSA first_remaining_peer = __ffs(matchmask) - 1; #endif my_s[threadIdx.x] += @@ -142,7 +142,7 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& ctx, #ifdef PADDLE_WITH_HIP constexpr int kWarpSize = 64; constexpr int kBlockDimY = 16; -#else +#else // CUDA and MUSA constexpr int kWarpSize = 32; constexpr int kBlockDimY = 32; #endif diff --git a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu index edac497bc8e8b..42786f4b64355 100644 --- a/paddle/phi/kernels/funcs/fft.cu +++ b/paddle/phi/kernels/funcs/fft.cu @@ -104,7 +104,7 @@ inline bool use_cache(const int64_t* signal_size) { } return using_cache; } -#elif defined(PADDLE_WITH_HIP) +#elif defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) inline bool use_cache(const int64_t* signal_size) { return true; } #endif @@ -200,6 +200,11 @@ void exec_fft(const phi::GPUContext& ctx, phi::dynload::hipfftSetStream(config->plan(), ctx.stream())); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mufftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mufftSetWorkArea(config->plan(), workspace_tensor.data())); #endif // execution of fft plan diff --git a/paddle/phi/kernels/funcs/fft_cache.h b/paddle/phi/kernels/funcs/fft_cache.h index 51e90a6c0d95b..a6f775af88ea7 100644 --- a/paddle/phi/kernels/funcs/fft_cache.h +++ b/paddle/phi/kernels/funcs/fft_cache.h @@ -25,6 +25,8 @@ #include "paddle/phi/kernels/funcs/cufft_util.h" #elif defined(PADDLE_WITH_HIP) #include "paddle/phi/kernels/funcs/hipfft_util.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/funcs/mufft_util.h" #endif namespace phi { diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 1d067b0fc2918..b7aa46dcb004e 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -1350,7 +1350,7 @@ __global__ void LayerNormBackwardComputeGradInput(const T *__restrict__ dout, // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize); sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize); -#else +#else // CUDA and MUSA // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += __shfl_xor_sync(0xffffffff, sum_loss1, mask, warpSize); sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, warpSize); @@ -1501,7 +1501,7 @@ __global__ void LayerNormBackwardComputeGradInputWithSmallFeatureSize( // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize); sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize); -#else +#else // CUDA and MUSA // WARP_SHFL_XOR(sum_loss, mask); sum_loss1 += 
__shfl_xor_sync(0xffffffff, sum_loss1, mask, WarpSize); sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, WarpSize); diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h index 1a6cca7f11aae..d9fb6de531557 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h +++ b/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -20,6 +20,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #include diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 2976968d07b70..b3684c4d4e0ba 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -23,6 +23,9 @@ #include namespace cub = hipcub; #endif +#ifdef __MCC__ +//TODO +#endif #include #include "paddle/phi/backends/gpu/gpu_launch_config.h" diff --git a/paddle/phi/kernels/funcs/softmax.cu b/paddle/phi/kernels/funcs/softmax.cu index 2ca97cd4ac205..55c24e8c980ff 100644 --- a/paddle/phi/kernels/funcs/softmax.cu +++ b/paddle/phi/kernels/funcs/softmax.cu @@ -60,6 +60,8 @@ void SoftmaxCUDNNFunctor::operator()( context.template Alloc(Y), MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_INSTANCE)); +#elif defined(PADDLE_WITH_MUSA) + // TODO #else cudnnTensorDescriptor_t cudnn_x_desc = xDesc.descriptor(layout, cudnn_tensor_dims); @@ -117,6 +119,8 @@ void SoftmaxGradCUDNNFunctor::operator()( context.template Alloc(XGrad), MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_INSTANCE)); +#elif defined(PADDLE_WITH_MUSA) + // TODO #else cudnnTensorDescriptor_t cudnn_y_desc = yDesc.descriptor(layout, cudnn_tensor_dims); diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas.h b/paddle/phi/kernels/funcs/sparse/sparse_blas.h index f6d67488d1f48..9a6534c32a1c6 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas.h @@ -100,3 +100,6 @@ inline SparseBlasT GetSparseBlas( #if defined(PADDLE_WITH_HIP) && HIP_VERSION >= 402 #include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h" #endif +#if defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.mu.h" +#endif diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu index 3cb1a6742543a..85dc7d31f2064 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu @@ -91,6 +91,10 @@ __global__ void VectorizedDropoutForward(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h index 418fa8bf55ce9..9c5e336a9f148 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h @@ -22,6 +22,10 @@ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/kernels/funcs/aligned_vector.h" @@ -29,7 +33,7 @@ #ifdef PADDLE_WITH_HIP #define WARP_SIZE 64 -#else +#else // MUSA & CUDA #define WARP_SIZE 32 #endif 
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index aa703ede3bad6..3eff633ff0c51 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -299,7 +299,7 @@ void HardSwishGradKernel(const Context& dev_ctx, } // namespace phi -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(relu_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 83e130f0a71bd..d741549bebcf9 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -186,7 +186,7 @@ PD_REGISTER_KERNEL(relu, float, double, phi::dtype::float16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(relu, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu index 99ccfcd8667e6..13a65c6a64f8b 100644 --- a/paddle/phi/kernels/gpu/allclose_kernel.cu +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -87,6 +87,8 @@ void AllCloseKernel(const Context& dev_ctx, grid = (grid > block) ? block : grid; #ifdef PADDLE_WITH_HIP hipMemset(out_data, true, sizeof(bool)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_data, true, sizeof(bool)); #else cudaMemset(out_data, true, sizeof(bool)); #endif diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index b940374556009..7546ebbaf736c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -568,7 +568,7 @@ void BatchNormGradRawKernel(const Context &ctx, scale.dims()[0])); auto dtype = phi::backends::gpu::CudnnDataType::type; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; @@ -650,6 +650,15 @@ void BatchNormGradRawKernel(const Context &ctx, // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t bn_param_desc_; + mudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -694,6 +703,15 @@ void BatchNormGradRawKernel(const Context &ctx, // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, // data_desc_, mode_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, @@ -1113,6 +1131,11 @@ void BatchNormGradRawKernel(const Context &ctx, // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_GPU_SUCCESS( @@ -1407,7 +1430,7 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(batch_norm_grad, GPU, ALL_LAYOUT, @@ -1445,7 +1468,7 @@ PD_REGISTER_KERNEL(batch_norm_double_grad, phi::BatchNormDoubleGradKernel, float, double) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(batch_norm_double_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 5c2d76be35992..2a4a435f9c96a 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -551,7 +551,7 @@ void BatchNormKernel(const Context &ctx, auto dtype = phi::backends::gpu::CudnnDataType::type; -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; @@ -593,6 +593,15 @@ void BatchNormKernel(const Context &ctx, // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t bn_param_desc_; + mudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -641,7 +650,7 @@ void BatchNormKernel(const Context &ctx, strides = {H * W * D * C, 1, W * D * C, D * C, C}; } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // TODO(wangran16): wait for MIOpen to improve the performance of BN // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, @@ -942,7 +951,7 @@ void BatchNormKernel(const Context &ctx, // ctx.GetPlace())), // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); -#else +#else // CUDA & MUSA // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; const bool use_native_kernel = ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || @@ -1206,6 +1215,11 @@ void BatchNormKernel(const Context &ctx, // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(bn_param_desc_)); #else // 
clean when exit. PADDLE_ENFORCE_GPU_SUCCESS( @@ -1256,7 +1270,7 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(batch_norm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index 21dedeb94a62c..3a144b3ba7a40 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -289,7 +289,7 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, float, double, phi::dtype::float16) {} -#else +#else // CUDA & MUSA #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, GPU, diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index f8964f4ec5312..3bd4595c48b21 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -763,6 +763,8 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #endif @@ -782,12 +784,20 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, softmax_data, MIOPEN_SOFTMAX_LOG, mode)); +#else +#ifdef PADDLE_WITH_MUSA + auto mode = axis == rank - 1 ? MUDNN_SOFTMAX_MODE_INSTANCE + : MUDNN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSoftmaxForward( + handle, + MUDNN_SOFTMAX_LOG, #else auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, +#endif mode, phi::backends::gpu::CudnnDataType::kOne(), descp, @@ -1199,6 +1209,8 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #endif diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu index 661a1dd90e7e9..ff344fb47dcd6 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu @@ -195,7 +195,11 @@ void CudnnLSTMGradKernel( reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNBackwardData(handle, +#else phi::dynload::cudnnRNNBackwardData(handle, +#endif rnn.rnn_desc(), seq_length, rnn.y_descs(), @@ -223,7 +227,11 @@ void CudnnLSTMGradKernel( const_cast(reserve_data), reserve_size)); +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnRNNBackwardWeights( +#else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( +#endif handle, rnn.rnn_desc(), seq_length, @@ -305,7 +313,7 @@ void CudnnLSTMGradKernel( #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL( cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL( cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float, double) { } diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu index f3a03727e0bc4..bcc1f1464bed1 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu @@ -25,6 +25,9 @@ template #ifdef PADDLE_WITH_HIP void LSTMInferece(const bool &has_seq_length, const miopenHandle_t &handle, +#elif defined(PADDLE_WITH_MUSA) +void LSTMInferece(const bool &has_seq_length, + const mudnnHandle_t &handle, #else void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, @@ -64,6 +67,27 @@ void LSTMInferece(const bool &has_seq_length, last_c_data, workspace_data->data(), workspace_size)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnRNNForwardInference(handle, @@ -293,7 +338,11 @@ void CudnnLSTMKernel( reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNForwardTraining(handle, +#else
phi::dynload::cudnnRNNForwardTraining(handle, +#endif rnn.rnn_desc(), seq_length, rnn.x_descs(), @@ -366,7 +415,7 @@ PD_REGISTER_KERNEL(cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float) { kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL( cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float, double) { kernel->InputAt(5).SetDataType(phi::DataType::INT32); diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h index e5fc51849454d..033efe0b9e7b5 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h +++ b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h @@ -26,6 +26,9 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpu/miopen_lstm_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/kernels/gpu/mudnn_lstm_cache.h" +#endif namespace phi { diff --git a/paddle/phi/kernels/gpu/cum_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_grad_kernel.cu index 620d185475ef9..d92dab27c8c15 100644 --- a/paddle/phi/kernels/gpu/cum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_grad_kernel.cu @@ -63,7 +63,7 @@ void CumsumGradKernel(const Context& dev_ctx, } // namespace phi -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(cumsum_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/dirichlet_kernel.cu b/paddle/phi/kernels/gpu/dirichlet_kernel.cu index 09d6a402e701a..bed4d840062f7 100644 --- a/paddle/phi/kernels/gpu/dirichlet_kernel.cu +++ b/paddle/phi/kernels/gpu/dirichlet_kernel.cu @@ -25,6 +25,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #endif #ifdef PADDLE_WITH_HIP #include @@ -40,6 +42,11 @@ using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t; #define COMPAT_RAND_INIT hiprand_init #define COMPAT_RAND_UNIFORM hiprand_uniform #define COMPAT_RAND_NORMAL hiprand_normal +#elif defined(PADDLE_WITH_MUSA) +using COMPAT_RANDSTATEPHILOX4_32_10_T = murand_state_philox4x32_10_t; +#define COMPAT_RAND_INIT murand_init +#define COMPAT_RAND_UNIFORM murand_uniform +#define COMPAT_RAND_NORMAL murand_normal #endif namespace phi { diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 99ba12b1d6213..5fdf63083896e 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -99,6 +99,9 @@ struct EmbeddingGradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index ac0dea5165379..966d018feb97f 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -61,7 +61,7 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; @@ -76,6 +76,8 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, int* item_count_ptr = reinterpret_cast(item_count->ptr()); #ifdef 
PADDLE_WITH_HIP hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); #else cudaMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); #endif @@ -97,6 +99,11 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, item_count_ptr + num_input, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(&total_unique_items, + item_count_ptr + num_input, + sizeof(int), + musaMemcpyDeviceToHost); #else cudaMemcpy(&total_unique_items, item_count_ptr + num_input, @@ -131,7 +138,7 @@ void FillBufferHashTable(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; @@ -170,7 +177,7 @@ void ResetBufferHashTable(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (unique_items->size() + block - 1) / block; @@ -193,7 +200,7 @@ void ReindexSrc(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_edges + block - 1) / block; @@ -293,7 +300,7 @@ void BufferReindex(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP int block = 256; #else - int block = 1024; + int block = 1024; // CUDA & MUSA #endif int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_edges + block - 1) / block; @@ -364,6 +371,11 @@ void ReindexDst(const Context& dev_ctx, thrust::raw_pointer_cast(dst_ptr.data()) + node_len, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(&count_i, + thrust::raw_pointer_cast(dst_ptr.data()) + node_len, + sizeof(int), + musaMemcpyDeviceToHost); #else cudaMemcpy(&count_i, thrust::raw_pointer_cast(dst_ptr.data()) + node_len, diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index c01a8ea9d2e01..416352d5cb6ea 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -22,6 +22,9 @@ #ifdef PADDLE_WITH_HIP #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -82,6 +85,12 @@ __global__ void SampleKernel(const uint64_t rand_seed, threadIdx.y * CTA_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * CTA_SIZE + threadIdx.x, + 0, + &rng); #else curandStatePhilox4_32_10_t rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -118,6 +127,8 @@ __global__ void SampleKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif @@ -218,6 +229,10 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, hiprandState rng; hiprand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4_32_10 rng; + murand_init( + rand_seed * gridDim.x + blockIdx.x, 
threadIdx.y + threadIdx.x, 0, &rng); #else curandStatePhilox4_32_10_t rng; curand_init( @@ -242,6 +257,8 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, for (int idx = split + threadIdx.x; idx <= deg - 1; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index bff91078865d9..3c2e4fa856a6a 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -42,6 +42,15 @@ inline void CopyBCastOff(const BroadCastInfo& bcast_info, bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), + bcast_info.l_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + musaMemcpyHostToDevice); + musaMemcpy(thrust::raw_pointer_cast(r_bcastoff->data()), + bcast_info.r_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + musaMemcpyHostToDevice); #else cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), bcast_info.l_offset.data(), diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index ef39abd939410..5b0dda3030cf1 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -300,6 +300,9 @@ void GroupNormDirectCUDAFunctor::operator()( #ifdef PADDLE_WITH_HIP hipMemset(mean, 0, sizeof(AccT) * input_ddim[0] * groups); hipMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(mean, 0, sizeof(AccT) * input_ddim[0] * groups); + musaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); #else cudaMemset(mean, 0, sizeof(AccT) * input_ddim[0] * groups); cudaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 0f17a1bcc318a..14be4ee79d142 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -401,6 +401,14 @@ void InstanceNormGradKernel(const Context &dev_ctx, phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; @@ -427,6 +435,15 @@ void InstanceNormGradKernel(const Context &dev_ctx, const_cast(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, @@ -464,9 +481,14 @@ void InstanceNormGradKernel(const Context &dev_ctx, epsilon, saved_mean_data, saved_var_data)); +#else +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnBatchNormalizationBackward( + dev_ctx.mudnn_handle(), #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), +#endif CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), @@ -511,6 +533,11 @@ void InstanceNormGradKernel(const Context &dev_ctx, phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(in_param_desc_)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); @@ -659,7 +686,7 @@ PD_REGISTER_KERNEL(instance_norm_double_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(instance_norm_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index 7f10eac67c67c..51339ea33d36b 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -69,6 +69,14 @@ void InstanceNormKernel(const Context &dev_ctx, phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t data_desc_; + mudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; @@ -100,6 +108,15 @@ void InstanceNormKernel(const Context &dev_ctx, const_cast(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, mudnnBNSpatial)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, @@ -198,7 +215,11 @@ void InstanceNormKernel(const Context &dev_ctx, phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnBatchNormalizationForwardTraining( +#else phi::dynload::cudnnBatchNormalizationForwardTraining( +#endif handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), @@ -217,11 +238,18 @@ void InstanceNormKernel(const Context &dev_ctx, saved_mean_data, saved_variance_data)); +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDestroyTensorDescriptor(in_param_desc_)); +#else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif +#endif } } // namespace phi @@ -243,7 +271,7 @@ PD_REGISTER_KERNEL(instance_norm, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(instance_norm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index e8fc640cdd508..e1c660e674427 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -137,7 +137,7 @@ PD_REGISTER_KERNEL(layer_norm_grad, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(layer_norm_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 34425d8cfcfe2..336a655d9c8fa 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -482,7 +482,7 @@ void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, template class LayerNormDirectCUDAFunctor; template class LayerNormDirectCUDAFunctor; -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA)) && !defined(PADDLE_WITH_HIP) template class LayerNormDirectCUDAFunctor; #endif @@ -689,7 +689,7 @@ PD_REGISTER_KERNEL(layer_norm, kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(layer_norm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index f6a5b26960a62..1b0bfaea403c3 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -48,6 +48,13 @@ PD_REGISTER_KERNEL(log_softmax_grad, float, phi::dtype::float16, phi::dtype::bfloat16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16) {} #else PD_REGISTER_KERNEL(log_softmax_grad, GPU, diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 6dfe3d2b6173d..7b6ffe2d0cfd4 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -47,6 +47,13 
@@ PD_REGISTER_KERNEL(log_softmax, float, phi::dtype::float16, phi::dtype::bfloat16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16) {} #else PD_REGISTER_KERNEL(log_softmax, GPU, diff --git a/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu b/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu index 4f4ee36892d62..f02f47edc4e28 100644 --- a/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/logcumsumexp_grad_impl.h" -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(logcumsumexp_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/logsumexp_function.cu.h b/paddle/phi/kernels/gpu/logsumexp_function.cu.h index 53b6fb6d2b20d..76d0b294f397b 100644 --- a/paddle/phi/kernels/gpu/logsumexp_function.cu.h +++ b/paddle/phi/kernels/gpu/logsumexp_function.cu.h @@ -46,7 +46,7 @@ __inline__ __device__ T WarpAllReduce(T val) { for (int mask = ThreadGroupWidth / 2; mask > 0; mask /= 2) { #if PADDLE_WITH_HIP val = Functor()(val, __shfl_xor(0xffffffff, val, mask)); -#else +#else // CUDA & MUSA val = Functor()(val, __shfl_xor_sync(0xffffffff, val, mask)); #endif } @@ -69,6 +69,22 @@ inline void GetNumBlocks(int64_t block_size, *num_blocks = std::max( 1, std::min(max_blocks, sm_count * tpm / block_size * waves)); } +#elif defined(PADDLE_WITH_MUSA) +inline void GetNumBlocks(int64_t block_size, + int64_t max_blocks, + int64_t waves, + int* num_blocks) { + int dev; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&dev)); + int sm_count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&sm_count, musaDevAttrMultiProcessorCount, dev)); + int tpm; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &tpm, musaDevAttrMaxThreadsPerMultiProcessor, dev)); + *num_blocks = std::max( + 1, std::min(max_blocks, sm_count * tpm / block_size * waves)); +} #else inline void GetNumBlocks(int64_t block_size, int64_t max_blocks, @@ -193,6 +209,12 @@ inline hipError_t LaunchLogsumexpWarp(const Context& dev_ctx, const int64_t num_col, const SourceType* in, SourceType* out) { +#elif defined(PADDLE_WITH_MUSA) +inline musaError_t LaunchLogsumexpWarp(const Context& dev_ctx, + const int64_t num_row, + const int64_t num_col, + const SourceType* in, + SourceType* out) { #else inline cudaError_t LaunchLogsumexpWarp(const Context& dev_ctx, const int64_t num_row, @@ -222,6 +244,8 @@ inline cudaError_t LaunchLogsumexpWarp(const Context& dev_ctx, dev_ctx, num_row, num_col, in, out); #if PADDLE_WITH_HIP return hipPeekAtLastError(); +#elif defined(PADDLE_WITH_MUSA) + return musaPeekAtLastError(); #else return cudaPeekAtLastError(); #endif @@ -240,6 +264,12 @@ inline hipError_t DispatchLogsumexpWarpWithPadding(const Context& dev_ctx, const int64_t num_col, const SourceType* in, SourceType* out) { +#elif defined(PADDLE_WITH_MUSA) +inline musaError_t DispatchLogsumexpWarpWithPadding(const Context& dev_ctx, + const int64_t num_row, + const int64_t num_col, + const SourceType* in, + SourceType* out) { #else inline cudaError_t DispatchLogsumexpWarpWithPadding(const Context& dev_ctx, const int64_t num_row, @@ -287,6 +317,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, if (num_col <= 0) { #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; 
#endif @@ -367,6 +399,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, #undef HANDLE_COL #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; #endif @@ -391,6 +425,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, if (num_col <= 0) { #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; #endif @@ -455,6 +491,8 @@ DispatchLogsumexpWarpCols(const Context& dev_ctx, #undef HANDLE_COL #if PADDLE_WITH_HIP return hipErrorInvalidValue; +#elif defined(PADDLE_WITH_MUSA) + return musaErrorInvalidValue; #else return cudaErrorInvalidValue; #endif diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu index 7895983236f91..4e5a2942d6b3b 100644 --- a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu @@ -36,6 +36,8 @@ void NllLossGradKernel(const Context& dev_ctx, auto total_weight_data = total_weight.data(); #ifdef PADDLE_WITH_HIP hipMemset(dx_data, 0, dx->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(dx_data, 0, dx->numel() * sizeof(T)); #else cudaMemset(dx_data, 0, dx->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu index 1e80eb9bb460e..5d9aec594089d 100644 --- a/paddle/phi/kernels/gpu/nll_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu @@ -37,6 +37,8 @@ void NllLossRawKernel(const Context& dev_ctx, auto weight_data = weight.get_ptr() ? weight.get_ptr()->data() : nullptr; #ifdef PADDLE_WITH_HIP hipMemset(total_weight_data, 0, sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(total_weight_data, 0, sizeof(T)); #else cudaMemset(total_weight_data, 0, sizeof(T)); #endif diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index fc27258981d39..e351c29138ee8 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -25,6 +25,10 @@ namespace phi { using gpuRNNMode_t = miopenRNNMode_t; using gpuDnnHandle_t = miopenHandle_t; using gpuDnnDataType_t = miopenDataType_t; +#elif defined(PADDLE_WITH_MUSA) +using gpuRNNMode_t = mudnnRNNMode_t; +using gpuDnnHandle_t = mudnnHandle_t; +using gpuDnnDataType_t = mudnnDataType_t; #else using gpuRNNMode_t = cudnnRNNMode_t; using gpuDnnHandle_t = cudnnHandle_t; @@ -103,6 +107,9 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnDropoutGetStatesSize(handle, &state_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); @@ -143,8 +150,12 @@ class RNNDescriptors { mode_, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); +#else +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetRNNDescriptor( #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor( +#endif rnn_desc_.desc(), hidden_size_, num_layers_, @@ -167,6 +178,9 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnGetRNNParamsSize( + handle, rnn_desc_.desc(), x_descs_[0], 
&weights_size_, cudnn_type)); #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); @@ -191,6 +205,15 @@ class RNNDescriptors { workspace_size)); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::mudnnGetRNNWorkspaceSize(handle, + rnn_desc_.desc(), + seq_length_, + x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnGetRNNTrainingReserveSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnGetRNNWorkspaceSize(handle, @@ -212,6 +235,16 @@ class RNNDescriptors { miopenRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } miopenDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } miopenTensorDescriptor_t weight_desc() { return weight_desc_.desc(); } +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } + mudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } + mudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); } + mudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); } + mudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); } + mudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); } + mudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } + mudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } + mudnnTensorDescriptor_t weight_desc() { return weight_desc_.desc(); } #else cudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } cudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } @@ -243,6 +276,9 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP std::vector x_descs_; std::vector y_descs_; +#elif defined(PADDLE_WITH_MUSA) + std::vector x_descs_; + std::vector y_descs_; #else std::vector x_descs_; std::vector y_descs_; diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index 910c8e8b6a57a..44bca2124770a 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -105,6 +105,16 @@ void RnnGradKernel(const Context &dev_ctx, rnn_mode = miopenRNNRELU; else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; +#elif defined(PADDLE_WITH_MUSA) + mudnnRNNMode_t rnn_mode = MUDNN_LSTM; + if (mode == "LSTM") + rnn_mode = MUDNN_LSTM; + else if (mode == "GRU") + rnn_mode = MUDNN_GRU; + else if (mode == "RNN_RELU") + rnn_mode = MUDNN_RNN_RELU; + else if (mode == "RNN_TANH") + rnn_mode = MUDNN_RNN_TANH; #else cudnnRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") @@ -195,6 +205,8 @@ void RnnGradKernel(const Context &dev_ctx, T *init_c_grad_data = nullptr; #ifdef PADDLE_WITH_HIP if (rnn_mode == miopenLSTM) { +#elif defined(PADDLE_WITH_MUSA) + if (rnn_mode == MUDNN_LSTM) { #else if (rnn_mode == CUDNN_LSTM) { #endif @@ -341,8 +353,12 @@ void RnnGradKernel(const Context &dev_ctx, // permute weight grad list from weight grad tensor TensorToPermutedWeight( place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); +#else +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnRNNBackwardWeights( #else PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( +#endif handle, rnn.rnn_desc(), seq_length, diff --git 
a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index c1ed3f16e0584..601c1a524c402 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -65,7 +65,11 @@ void RNNInferece(bool has_seq_length, workspace_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNForwardInference(handle, +#else phi::dynload::cudnnRNNForwardInference(handle, +#endif rnn->rnn_desc(), seq_length, rnn->x_descs(), @@ -154,6 +158,16 @@ void RnnKernel(const Context &dev_ctx, rnn_mode = miopenRNNRELU; else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; +#elif defined(PADDLE_WITH_MUSA) + gpuRNNMode_t rnn_mode = MUDNN_LSTM; + if (mode == "LSTM") + rnn_mode = MUDNN_LSTM; + else if (mode == "GRU") + rnn_mode = MUDNN_GRU; + else if (mode == "RNN_RELU") + rnn_mode = MUDNN_RNN_RELU; + else if (mode == "RNN_TANH") + rnn_mode = MUDNN_RNN_TANH; #else gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") @@ -188,6 +202,8 @@ void RnnKernel(const Context &dev_ctx, T *last_c_data = nullptr; #ifdef PADDLE_WITH_HIP if (rnn_mode == miopenLSTM) { +#elif defined(PADDLE_WITH_MUSA) + if (rnn_mode == MUDNN_LSTM) { #else if (rnn_mode == CUDNN_LSTM) { #endif @@ -333,7 +349,11 @@ void RnnKernel(const Context &dev_ctx, reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( +#ifdef PADDLE_WITH_MUSA + phi::dynload::mudnnRNNForwardTraining(handle, +#else phi::dynload::cudnnRNNForwardTraining(handle, +#endif rnn.rnn_desc(), seq_length, rnn.x_descs(), @@ -405,7 +425,7 @@ void RnnKernel(const Context &dev_ctx, PD_REGISTER_KERNEL(rnn, GPU, ALL_LAYOUT, phi::RnnKernel, float) { kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(rnn, GPU, ALL_LAYOUT, phi::RnnKernel, float, double) { kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } diff --git a/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu index a7e4e32ed1d17..58cf7a273f540 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu @@ -49,6 +49,8 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( #ifdef PADDLE_WITH_HIP hipMemset(p_output, 0, memset_bytes); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(p_output, 0, memset_bytes); #else cudaMemset(p_output, 0, memset_bytes); #endif diff --git a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu index 85cc80e36b517..3aa20279bdd29 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu @@ -63,6 +63,8 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, if (reduce_op == "SUM" || reduce_op == "MEAN") { #ifdef PADDLE_WITH_HIP hipMemset(p_output, 0, memset_bytes); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(p_output, 0, memset_bytes); #else cudaMemset(p_output, 0, memset_bytes); #endif @@ -138,6 +140,8 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, #ifdef PADDLE_WITH_HIP hipMemset(p_dst_count, 0, input_size * sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(p_dst_count, 0, input_size * sizeof(int)); #else cudaMemset(p_dst_count, 0, input_size * sizeof(int)); #endif diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index d368c43a29753..a1c2a0dcf2214 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ 
-166,6 +166,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -243,6 +248,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -289,6 +299,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -358,6 +373,11 @@ void CalculateXGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -493,6 +513,9 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( #ifdef PADDLE_WITH_HIP hipMemset(x_grad_data, 0, memset_bytes_x); hipMemset(e_grad_data, 0, memset_bytes_e); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(x_grad_data, 0, memset_bytes_x); + musaMemset(e_grad_data, 0, memset_bytes_e); #else cudaMemset(x_grad_data, 0, memset_bytes_x); cudaMemset(e_grad_data, 0, memset_bytes_e); diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 764490bd1cb8b..33f7cbccd0f5e 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -61,6 +61,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, if (reduce_op == "SUM" || reduce_op == "MEAN") { #ifdef PADDLE_WITH_HIP hipMemset(out_data, 0, memset_bytes); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_data, 0, memset_bytes); #else cudaMemset(out_data, 0, memset_bytes); #endif @@ -104,7 +106,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, #ifdef PADDLE_WITH_HIP int block_ = 256; #else - int block_ = 1024; + int block_ = 1024; // CUDA & MUSA #endif if (reduce_op == "SUM" || reduce_op == "MEAN") { GraphSendUERecvSumCUDAFunctor sum_functor; @@ -158,6 +160,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, int* dst_count_data = dst_count->data(); #ifdef PADDLE_WITH_HIP hipMemset(dst_count_data, 0, input_size * sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(dst_count_data, 0, input_size * sizeof(int)); #else cudaMemset(dst_count_data, 0, input_size * sizeof(int)); #endif diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index c50b1960d0056..408f4bf26593c 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -116,6 +116,11 @@ void CalculateGrad(const Context& ctx, x_grad_out.data(), x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -198,6 +203,11 @@ void CalculateGrad(const Context& ctx, x_grad_out.data(), 
x_grad_out.numel() * sizeof(T), hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + musaMemcpyDeviceToDevice); #else cudaMemcpy(x_grad, x_grad_out.data(), @@ -247,6 +257,9 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, #ifdef PADDLE_WITH_HIP hipMemset(x_grad_data, 0, memset_bytes_x); hipMemset(y_grad_data, 0, memset_bytes_y); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(x_grad_data, 0, memset_bytes_x); + musaMemset(y_grad_data, 0, memset_bytes_y); #else cudaMemset(x_grad_data, 0, memset_bytes_x); cudaMemset(y_grad_data, 0, memset_bytes_y); diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index bef328ec21a20..0bbbd079f9738 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -198,7 +198,7 @@ void TopkKernel(const Context& dev_ctx, gridx, input_height, largest)); -#else +#else // CUDA & MUSA FIXED_BLOCK_DIM(switch (phi::funcs::getMaxLength(k)) { FIXED_MAXLENGTH( phi::funcs::KeMatrixTopK @@ -307,7 +307,7 @@ void TopkKernel(const Context& dev_ctx, gridx, input_height, largest)); -#else +#else // CUDA & MUSA FIXED_BLOCK_DIM(switch (phi::funcs::getMaxLength(k)) { FIXED_MAXLENGTH(phi::funcs::KeMatrixTopK <<>>( diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 2a3c9515ac2ea..255948e1f6570 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #include "paddle/phi/kernels/affine_grid_grad_kernel.h" #include "paddle/phi/backends/all_context.h" diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 2c6e898fa25c8..1b01f2b8131c9 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -22,6 +22,8 @@ #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/gpudnn/conv_mudnn_helper.h" #else #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif @@ -138,6 +140,9 @@ void ConvCudnnGradKernelImplV7( #ifdef PADDLE_WITH_HIP SearchResult bwd_result; SearchResult filter_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult bwd_result; + SearchResult filter_result; #else SearchResult bwd_result; SearchResult filter_result; @@ -146,7 +151,7 @@ void ConvCudnnGradKernelImplV7( int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_groups = 1; c_groups = groups; groups = 1; @@ -172,7 +177,7 @@ void ConvCudnnGradKernelImplV7( workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search1 = SearchAlgorithm; bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); workspace_size = std::max(workspace_size, bwd_result.workspace_size); @@ -198,7 +203,7 @@ void ConvCudnnGradKernelImplV7( workspace_size = 
std::max(workspace_size, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, exhaustive_search, deterministic); @@ -213,7 +218,7 @@ void ConvCudnnGradKernelImplV7( #ifdef PADDLE_WITH_HIP // MIOPEN ONLY support beta to be 0.0f ScalingParamType beta = 0.0f; -#else +#else // CUDA & MUSA ScalingParamType beta = use_addto ? 1.0f : 0.0f; #endif @@ -278,7 +283,7 @@ void ConvCudnnGradKernelImplV7( }, workspace_size); } -#else +#else // CUDA & MUSA ConvRunner::Apply(ctx, args1, bwd_result, @@ -318,7 +323,7 @@ void ConvCudnnGradKernelImplV7( workspace_size)); }, workspace_size); -#else +#else // MUSA & CUDA ConvRunner::Apply(ctx, args2, filter_result, @@ -455,7 +460,7 @@ void ConvCudnnGradKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = phi::backends::gpu::DataLayout::kNCHW; -#else +#else // MUSA & CUDA #if CUDNN_VERSION_MIN(8, 1, 0) const bool compute_in_nhwc = (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) && @@ -1004,7 +1009,7 @@ void ConvCudnnGradGradKernel( int iwo_group = groups; int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_group = 1; c_group = groups; groups = 1; @@ -1061,6 +1066,11 @@ void ConvCudnnGradGradKernel( SearchResult fwd_result2; SearchResult data_result; SearchResult filter_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult fwd_result1; + SearchResult fwd_result2; + SearchResult data_result; + SearchResult filter_result; #else SearchResult fwd_result1; SearchResult fwd_result2; @@ -1091,7 +1101,7 @@ void ConvCudnnGradGradKernel( workspace_size = search1::GetWorkspaceSize(args1); fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); -#else +#else // CUDA & MUSA using search1 = SearchAlgorithm; fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); @@ -1116,7 +1126,7 @@ void ConvCudnnGradGradKernel( std::max(workspace_size, search2::GetWorkspaceSize(args2)); fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( @@ -1142,7 +1152,7 @@ void ConvCudnnGradGradKernel( workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, exhaustive_search, deterministic); @@ -1169,7 +1179,7 @@ void ConvCudnnGradGradKernel( workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search4 = SearchAlgorithm; data_result = search4::Find(ctx, args4, exhaustive_search, deterministic); @@ -1226,7 +1236,7 @@ void ConvCudnnGradGradKernel( workspace_size)); }, workspace_size); -#else +#else // MUSA & CUDA ConvRunner::Apply(ctx, args1, fwd_result1, @@ -1345,7 +1355,7 @@ void ConvCudnnGradGradKernel( workspace_size)); }, workspace_size); -#else +#else // CUDA & 
MUSA ConvRunner::Apply(ctx, args4, data_result, @@ -1540,7 +1550,7 @@ PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 15161dd61c697..e73ce989f0306 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -23,6 +23,8 @@ #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/kernels/gpudnn/conv_mudnn_helper.h" #else #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif @@ -84,7 +86,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, // MIOPEN need to set groups in cdesc in miopen_desc.h args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); -#else +#else // CUDA & MUSA args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif @@ -151,6 +153,11 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, workspace_size = search::GetWorkspaceSize(args); fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); +#elif defined(PADDLE_WITH_MUSA) + SearchResult fwd_result; + using search = SearchAlgorithm; + fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); + workspace_size = fwd_result.workspace_size; #else SearchResult fwd_result; using search = SearchAlgorithm; @@ -195,7 +202,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, workspace_size)); }, workspace_size); -#else +#else // CUDA & MUSA ConvRunner::Apply(ctx, args, fwd_result, @@ -363,7 +370,7 @@ void ConvCudnnKernel(const Context& ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); auto dtype = phi::backends::gpu::CudnnDataType::type; -#ifdef PADDLE_WITH_HIP +#ifd defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = phi::backends::gpu::DataLayout::kNCHW; #else @@ -651,7 +658,7 @@ PD_REGISTER_KERNEL(conv3d, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 50bae0a8bca3e..f30361864dbb0 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -32,6 +32,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" @@ -167,7 +170,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_groups = 1; c_groups = groups; groups = 1; @@ -200,6 +203,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP SearchResult fwd_result; SearchResult filter_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult fwd_result; + SearchResult filter_result; #else SearchResult fwd_result; SearchResult filter_result; @@ -228,7 +234,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); -#else +#else // MUSA & CUDA using search1 = SearchAlgorithm; fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( @@ -253,7 +259,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( @@ -292,7 +298,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, }; workspace_handle.RunFunc(cudnn_func, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args1, fwd_result, @@ -349,7 +355,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, }; workspace_handle.RunFunc(cudnn_func, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args2, filter_result, @@ -363,7 +369,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif } } @@ -613,7 +619,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( int iwo_group = groups; int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) iwo_group = 1; c_group = groups; groups = 1; @@ -670,6 +676,11 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( SearchResult bwd_result2; SearchResult filter_result; SearchResult fwd_result; +#elif defined(PADDLE_WITH_MUSA) + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; #else SearchResult bwd_result1; SearchResult bwd_result2; @@ -700,7 +711,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size = search1::GetWorkspaceSize(args1); bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search1 = SearchAlgorithm; bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); @@ -722,7 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( 
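// A sketch of the group-count normalization guarded above: when the DNN
// library handles grouped convolution natively (cuDNN >= 7.0.1, MIOpen and,
// per this patch, the MUSA backend), per-group looping is disabled and the
// group count is folded into the convolution descriptor instead. Names here
// are illustrative.
struct GroupConfigSketch {
  int iwo_groups;   // groups applied to the input/weight/output descriptors
  int c_groups;     // groups applied to the convolution descriptor
  int loop_groups;  // how many times the caller still loops explicitly
};

inline GroupConfigSketch NormalizeGroupsSketch(int groups, bool native_group_support) {
  if (native_group_support) {
    return {1, groups, 1};  // descriptors see one group; the library splits internally
  }
  return {groups, 1, groups};  // legacy path: slice tensors and loop per group
}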
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search2 = SearchAlgorithm; bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( @@ -747,7 +758,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( @@ -773,7 +784,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); -#else +#else // CUDA & MUSA using search4 = SearchAlgorithm; fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( @@ -833,7 +844,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args1, bwd_result1, @@ -847,7 +858,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif #ifdef PADDLE_WITH_HIP for (int i = 0; i < groups; i++) { @@ -886,7 +897,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out)); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args2, bwd_result2, @@ -900,7 +911,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, true); -#endif // PADDLE_WITH_HIP +#endif if ((!is_sys_pad) && (!channel_last)) { if (strides.size() == 2U) { @@ -956,7 +967,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); } -#else // PADDLE_WITH_HIP +#else // MUSA & CUDA ConvRunner::Apply(ctx, args3, filter_result, @@ -970,7 +981,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif } if (dx) { @@ -996,7 +1007,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); } -#else // PADDLE_WITH_HIP +#else // MUSA & CUDA ConvRunner::Apply(ctx, args4, fwd_result, @@ -1010,7 +1021,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif if (channel_last) { TransToChannelLast(ctx, &transformed_dx_channel, dx); @@ -1097,7 +1108,7 @@ PD_REGISTER_KERNEL(conv3d_transpose_grad, double, float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d_transpose_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index df360ab388a6d..ed64723a40e4f 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -30,6 +30,9 @@ limitations under the License. 
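// A sketch of the per-group pointer arithmetic the explicit MIOpen-style loop
// above relies on: each group reads and writes one contiguous channel slice.
// Offsets assume NCHW layout; all names are stand-ins.
#include <cstddef>

struct GroupOffsetsSketch {
  size_t in;      // elements skipped per group in the input tensor
  size_t out;     // elements skipped per group in the output tensor
  size_t filter;  // elements skipped per group in the filter tensor
};

inline GroupOffsetsSketch MakeGroupOffsetsSketch(size_t ic, size_t oc,
                                                 size_t ih, size_t iw,
                                                 size_t oh, size_t ow,
                                                 size_t filter_numel,
                                                 size_t groups) {
  return {ic / groups * ih * iw,   // input slice per group
          oc / groups * oh * ow,   // output slice per group
          filter_numel / groups};  // filter slice per group
}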
*/ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/rocm/mudnn_helper.h" +#include "paddle/phi/kernels/gpudnn/conv_mudnn_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" @@ -176,7 +179,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WTIH_MUSA) iwo_groups = 1; c_groups = groups; groups = 1; @@ -191,6 +194,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, size_t workspace_size = 0; #ifdef PADDLE_WITH_HIP miopenConvBwdDataAlgorithm_t algo{}; +#elif defined(PADDLE_WITH_MUSA) + mudnnConvBwdDataAlgorithm_t algo{}; #else cudnnConvolutionBwdDataAlgo_t algo{}; #endif @@ -227,6 +232,12 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); bwd_result.algo = search::Find(args, false, deterministic, workspace_size, ctx); +#elif defined(PADDLE_WITH_MUSA) + SearchResult bwd_result; + using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + bwd_result.algo = + search::Find(args, false, deterministic, workspace_size, ctx); #else SearchResult bwd_result; using search = SearchAlgorithm; @@ -262,7 +273,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, }; workspace_handle.RunFunc(cudnn_func, workspace_size); } -#else // PADDLE_WITH_HIP +#else // CUDA & MUSA ConvRunner::Apply(ctx, args, bwd_result, @@ -276,7 +287,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, workspace_size, &workspace_handle, false); -#endif // PADDLE_WITH_HIP +#endif if (!is_sys_pad && strides.size() == 2U) { funcs::Slice(ctx, &transformed_out, out, starts, ends, axes); @@ -385,7 +396,7 @@ PD_REGISTER_KERNEL(conv3d_transpose, double, float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(conv2d_transpose, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu index 1161040f2163f..a52e0e37d0e71 100644 --- a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -154,7 +154,7 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, // input grad transformed_input_grad.Resize(make_ddim(in_dims_vec)); -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // MIOPEN not support NHWC data layout } else if (data_format == str_NHWC) { layout = GPUDNNDataLayout::kNCHW; @@ -217,6 +217,11 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, layout, vectorize(transformed_input.dims())); miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, vectorize(transformed_output.dims())); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + mudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); #else cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, vectorize(transformed_input.dims())); @@ -238,6 +243,9 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP miopenPoolingDescriptor_t cudnn_pool_desc = 
pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#elif defined(PADDLE_WITH_MUSA) + mudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); #else cudnnPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); @@ -269,6 +277,17 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, input_grad_data, pool_workspace)); PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(dynload::mudnnPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, #else PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle, cudnn_pool_desc, @@ -289,7 +308,7 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, funcs::Transpose trans5_v4; trans5_v4(ctx, transformed_input_grad, input_grad, axis); } -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // MIOPEN not support NHWC data layout if (data_format == str_NHWC) { std::vector axis{0, 2, 3, 1}; @@ -424,7 +443,7 @@ PD_REGISTER_KERNEL(pool3d_grad, phi::Pool3dGradGPUDNNKernel, float, float16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(pool2d_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu index b1a79dd874068..8a6ceb29690d2 100644 --- a/paddle/phi/kernels/gpudnn/pool_kernel.cu +++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu @@ -111,8 +111,8 @@ void PoolRawGPUDNNKernel(const Context& ctx, out_dims_vec[3] = output->dims()[2]; out_dims_vec[4] = output->dims()[3]; transformed_output.Resize(make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) + // MIOPEN and MUDNN not support NHWC data layout } else if (data_format == str_NHWC) { layout = GPUDNNDataLayout::kNCHW; @@ -155,6 +155,11 @@ void PoolRawGPUDNNKernel(const Context& ctx, layout, vectorize(transformed_input.dims())); miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, vectorize(transformed_output.dims())); +#elif defined(PADDLE_WITH_MUSA) + mudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + mudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); #else cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, vectorize(transformed_input.dims())); @@ -172,6 +177,9 @@ void PoolRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP miopenPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#elif defined(PADDLE_WITH_MUSA) + mudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); #else cudnnPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); @@ -200,6 +208,16 @@ void PoolRawGPUDNNKernel(const Context& ctx, pool_workspace, pool_workernel_size_)); PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::mudnnPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data)); #else PADDLE_ENFORCE_GPU_SUCCESS( 
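// A sketch of the NHWC fallback guarded above: MIOpen and, per this patch,
// the MUSA DNN path pool only in NCHW, so NHWC tensors are transposed to
// NCHW before the library call and the result is transposed back afterwards.
// Axis orders below are for 4-D tensors; names are illustrative.
#include <vector>

struct NhwcFallbackAxesSketch {
  std::vector<int> to_nchw{0, 3, 1, 2};  // NHWC -> NCHW before pooling
  std::vector<int> to_nhwc{0, 2, 3, 1};  // NCHW -> NHWC after pooling
};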
dynload::cudnnPoolingForward(handle, @@ -217,7 +235,7 @@ void PoolRawGPUDNNKernel(const Context& ctx, funcs::Transpose trans5_v2; trans5_v2(ctx, transformed_output, output, axis); } -#ifdef PADDLE_WITH_HIP +#elif defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // MIOPEN not support NHWC data layout if (data_format == str_NHWC) { std::vector axis{0, 2, 3, 1}; @@ -295,7 +313,7 @@ PD_REGISTER_KERNEL( pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {} PD_REGISTER_KERNEL( pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(pool2d, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu index 72a5f37d14005..93dff54fa128c 100644 --- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu @@ -59,7 +59,7 @@ PD_REGISTER_KERNEL(softmax_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} -#else +#else // CUDA & MUSA PD_REGISTER_KERNEL(softmax_grad, GPUDNN, ALL_LAYOUT, diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index c918eeec83121..acf2fd4808814 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -19,6 +19,8 @@ #include "paddle/phi/core/kernel_registry.h" #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) +// TODO #else #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" #endif diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h index de59cb0c32ca1..f74094184e33f 100644 --- a/paddle/phi/kernels/impl/isclose_kernel_impl.h +++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h @@ -145,6 +145,8 @@ struct IscloseFunctor { grid = (grid > block) ? 
block : grid; #ifdef PADDLE_WITH_HIP hipMemset(out_data, true, num * sizeof(bool)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_data, true, num * sizeof(bool)); #else cudaMemset(out_data, true, num * sizeof(bool)); #endif diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h index 82b99b07a8927..f5a0998505dce 100644 --- a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h +++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h @@ -77,6 +77,11 @@ void SegmentKernelLaunchHelper(const Context& dev_ctx, segment_ids_ptr + num_indices - 1, sizeof(IndexT), hipMemcpyDeviceToHost)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(length_data, + segment_ids_ptr + num_indices - 1, + sizeof(IndexT), + musaMemcpyDeviceToHost)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(length_data, segment_ids_ptr + num_indices - 1, diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2a3579d99cfe6..8778dc144e503 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -20,6 +20,10 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/core/ddim.h" namespace phi { diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index ff50e9d1077b0..c5219c32cb743 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -57,6 +57,11 @@ PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} #endif +#if defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL() + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t, phi::dtype::float16) {} +#endif + #if defined(PADDLE_WITH_XPU_KP) && !defined(PADDLE_WITH_XPU) PD_REGISTER_KERNEL(min, KPS, ALL_LAYOUT, phi::MinKernel, float) {} #endif diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc index ff3173ec0a101..4bd01b667516b 100644 --- a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(batch_norm_coo_grad, } #endif -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_coo_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/batch_norm_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_kernel.cc index 04ab36892513c..5ea531bbab1c4 100644 --- a/paddle/phi/kernels/sparse/batch_norm_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_kernel.cc @@ -92,7 +92,7 @@ PD_REGISTER_KERNEL(batch_norm_coo, } #endif -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(batch_norm_coo, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index aaed804c92657..3366a86850bd2 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -89,6 +89,8 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, // 3. 
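// The MUSA registration added above mirrors the existing CUDA/HIP blocks;
// written out in the usual shape it reads as follows (guard and dtype list
// taken from the hunk, surrounding includes assumed from the same file).
#if defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(min,
                   GPU,
                   ALL_LAYOUT,
                   phi::MinKernel,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16) {}
#endif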
sort (indices, values index) #ifdef PADDLE_WITH_HIP thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::sort_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -100,6 +102,8 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, thrust::pair new_end = #ifdef PADDLE_WITH_HIP thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::unique_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 689629c939338..68ca818bad303 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -603,6 +603,8 @@ inline void CallThrustScan(const GPUContext& dev_ctx, int* h_offsets_ptr) { #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif definfed(PADDLE_WITH_MUSA) + thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -836,6 +838,8 @@ int ProductRuleBook(const Context& dev_ctx, // 2. remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + IntT* last = thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -884,6 +888,8 @@ int ProductRuleBook(const Context& dev_ctx, index_flags_ptr, index_flags.numel(), out_index_table_ptr); #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a7dcb6d514830..3bc6cd3b9ab92 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -78,6 +78,8 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, sizeof(IntT) * len, #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToDevice, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToDevice, #else cudaMemcpyDeviceToDevice, #endif @@ -86,6 +88,8 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, // performance, but thrust::merge_by_key limited by data size #ifdef PADDLE_WITH_HIP thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::sort_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -97,6 +101,8 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, thrust::pair new_end = #ifdef PADDLE_WITH_HIP thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::unique_by_key(thrust::musa::par.on(dev_ctx.stream()), #else thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -348,6 +354,8 @@ int ProductRuleBook(const Context& dev_ctx, // 2. 
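// A minimal sketch of the execution-policy dispatch repeated in the thrust
// calls above: each backend exposes a parallel policy bound to the current
// stream, and everything after the policy argument stays identical. The
// helper macro is illustrative; thrust::musa::par is used as in this patch.
#include <thrust/execution_policy.h>
#if defined(PADDLE_WITH_HIP)
#define SKETCH_THRUST_PAR(stream) thrust::hip::par.on(stream)
#elif defined(PADDLE_WITH_MUSA)
#define SKETCH_THRUST_PAR(stream) thrust::musa::par.on(stream)
#else
#define SKETCH_THRUST_PAR(stream) thrust::cuda::par.on(stream)
#endif
// Usage: thrust::sort_by_key(SKETCH_THRUST_PAR(dev_ctx.stream()),
//                            keys, keys + n, values);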
remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + IntT* last = thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -364,6 +372,8 @@ int ProductRuleBook(const Context& dev_ctx, sizeof(IntT), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -388,6 +398,8 @@ int ProductRuleBook(const Context& dev_ctx, IntT* bound_ptr = bound.data(); #ifdef PADDLE_WITH_HIP thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::lower_bound(thrust::musa::par.on(dev_ctx.stream()), #else thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -415,6 +427,8 @@ int ProductRuleBook(const Context& dev_ctx, // remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + IntT* last = thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -428,6 +442,8 @@ int ProductRuleBook(const Context& dev_ctx, sizeof(IntT), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -438,6 +454,8 @@ int ProductRuleBook(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -450,6 +468,8 @@ int ProductRuleBook(const Context& dev_ctx, kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -460,6 +480,8 @@ int ProductRuleBook(const Context& dev_ctx, kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyDeviceToHost, #else cudaMemcpyDeviceToHost, #endif @@ -501,6 +523,13 @@ int ProductRuleBook(const Context& dev_ctx, sizeof(IntT), hipMemcpyDeviceToHost, dev_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(IntT), + musaMemcpyDeviceToHost, + dev_ctx.stream()); #else phi::backends::gpu::GpuMemcpyAsync( &out_non_zero_num, diff --git a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu index 47daa1eae19ed..b9b340da8caee 100644 --- a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu @@ -43,6 +43,8 @@ void ElementWiseAddCooGPUKernel(const GPUContext& dev_ctx, const IntT* y_indices_ptr = y_indices.data(); #ifdef PADDLE_WITH_HIP bool is_same = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + bool is_same = thrust::equal(thrust::musa::par.on(dev_ctx.stream()), #else bool is_same = thrust::equal(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 7dbdbe2acc992..fc526adeacec5 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu 
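// A sketch of the host read-back pattern used above for rulebook sizes: a
// scalar at the end of a device buffer is copied back asynchronously on the
// compute stream, and the stream is waited on before the host value is used.
// Helper names are stand-ins; the memcpy-kind constant follows the backend.
#include <cstddef>
#include "paddle/phi/backends/gpu/gpu_info.h"  // phi::backends::gpu::GpuMemcpyAsync
#if defined(PADDLE_WITH_HIP)
#define SKETCH_MEMCPY_D2H hipMemcpyDeviceToHost
#elif defined(PADDLE_WITH_MUSA)
#define SKETCH_MEMCPY_D2H musaMemcpyDeviceToHost
#else
#define SKETCH_MEMCPY_D2H cudaMemcpyDeviceToHost
#endif

template <typename IntT, typename Context>
IntT ReadLastElementSketch(const Context& dev_ctx, const IntT* device_buf, size_t len) {
  IntT value = 0;
  phi::backends::gpu::GpuMemcpyAsync(
      &value, device_buf + len - 1, sizeof(IntT), SKETCH_MEMCPY_D2H, dev_ctx.stream());
  dev_ctx.Wait();  // the async copy must finish before `value` is valid on the host
  return value;
}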
@@ -68,7 +68,7 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx, set_zero(dev_ctx, dy, static_cast(0.0f)); sparse_blas.SPMM( true, false, static_cast(1), x_csr, dout, static_cast(0), dy); -#elif defined(PADDLE_WITH_CUDA) +#elif defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) sparse_blas.SPMM( true, false, static_cast(1), x, dout, static_cast(0), dy); #endif @@ -84,6 +84,10 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx, "rocsparse_sddmm with transpose, which is " "supported from " "ROCM 4.3.0")); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from " + "MUSA xxx")); #endif #endif } @@ -135,6 +139,10 @@ void MatmulCsrDenseGradKernel(const Context& dev_ctx, "rocsparse_sddmm with transpose, which is " "supported from " "ROCM 4.3.0")); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from " + "MUSA xxx")); #endif #endif } diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 3f0ec2c2713e5..913581710dc3f 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -103,6 +103,8 @@ void MaxPoolCooGPUKernel(const GPUContext& dev_ctx, // 2. max pool #ifdef PADDLE_WITH_HIP thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::fill(thrust::musa::par.on(dev_ctx.stream()), #else thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu index f47accfc8eff8..c998de2df3e46 100644 --- a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu @@ -178,6 +178,8 @@ void SliceCooGPUCompute(const Context& dev_ctx, d_out_nnz_indices.Resize({out_nnz}); #ifdef PADDLE_WITH_HIP thrust::sort(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::sort(thrust::musa::par.on(dev_ctx.stream()), #else thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -322,6 +324,8 @@ void SliceCsrTensor2D(const Context& dev_ctx, out_crows_data); #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#ifdef PADDLE_WITH_MUSA + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -471,6 +475,8 @@ void SliceCsrTensor3D(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -536,6 +542,8 @@ void SliceCsrTensor3D(const Context& dev_ctx, int64_t out_nnz = #ifdef PADDLE_WITH_HIP thrust::reduce(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::reduce(thrust::musa::par.on(dev_ctx.stream()), #else thrust::reduce(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -545,6 +553,8 @@ void SliceCsrTensor3D(const Context& dev_ctx, int64_t st = i * (out_n_rows + 1); #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -554,6 +564,8 
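// A sketch of the fallback used above when a sparse routine has no backend
// implementation yet: fail loudly with phi::errors::Unimplemented rather
// than silently computing nothing, so the missing SDDMM path is visible to
// the caller. The message wording is illustrative.
#include "paddle/phi/core/enforce.h"

inline void SddmmNotSupportedSketch() {
  PADDLE_THROW(phi::errors::Unimplemented(
      "backward of 'sparse.matmul' needs an SDDMM routine, which this "
      "backend does not provide yet."));
}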
@@ void SliceCsrTensor3D(const Context& dev_ctx, } #ifdef PADDLE_WITH_HIP thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()), #else thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif diff --git a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu index cf3dc79c8edd0..7be1b96b7ba52 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu @@ -216,6 +216,9 @@ void SoftmaxCooGradGPUKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP const auto& policy = thrust::hip::par.on(dev_ctx.stream()); bool is_same_offset = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + const auto& policy = thrust::musa::par.on(dev_ctx.stream()); + bool is_same_offset = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), #else const auto& policy = thrust::cuda::par.on(dev_ctx.stream()); bool is_same_offset = thrust::equal(thrust::cuda::par.on(dev_ctx.stream()), diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 084cb0e60bb6d..abc1f18f984b2 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -19,6 +19,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/musparse.h" #endif #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -132,6 +134,8 @@ void DenseToCooKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#elif defined(PADDLE_WITH_MUSA) + thrust::remove(thrust::musa::par.on(dev_ctx.stream()), #else thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif @@ -228,7 +232,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, if (x.nnz() <= 0) { #ifdef PADDLE_WITH_HIP DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); -#else +#else // MUSA and CUDA DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); #endif DenseTensor values = phi::EmptyLike(dev_ctx, x.values()); @@ -243,7 +247,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, const auto& csr_cols = Cast(dev_ctx, x.cols(), DataType::INT32); const int* csr_crows_data = csr_crows.template data(); const int* csr_cols_data = csr_cols.template data(); -#else +#else // MUSA & CUDA const auto& csr_crows = x.crows(); const auto& csr_cols = x.cols(); const IntT* csr_crows_data = csr_crows.data(); @@ -260,7 +264,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, int* coo_indices = indices.data(); int* coo_rows_data = coo_indices; int* coo_cols_data = coo_rows_data + non_zero_num; -#else +#else // MUSA & CUDA DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); DenseTensor offsets = phi::Empty(dev_ctx, {batches}); IntT* coo_indices = indices.data(); @@ -299,7 +303,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, coo_rows_data, rocsparse_index_base_zero); }); -#else +#else // MUSA & CUDA auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows @@ -310,7 +314,7 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, csr_cols_data, #ifdef PADDLE_WITH_HIP 
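// A sketch of what the CSR-to-COO conversion above computes: every compressed
// row-pointer pair (crows[i], crows[i+1]) expands into (crows[i+1] - crows[i])
// copies of the row index i. Serial CPU version for clarity only; the kernel
// above performs the same expansion in parallel on the device.
#include <cstddef>
#include <vector>

template <typename IntT>
std::vector<IntT> CrowsToCooRowsSketch(const std::vector<IntT>& crows) {
  std::vector<IntT> rows;
  for (size_t i = 0; i + 1 < crows.size(); ++i) {
    for (IntT k = crows[i]; k < crows[i + 1]; ++k) {
      rows.push_back(static_cast<IntT>(i));
    }
  }
  return rows;  // rows.size() equals the number of non-zeros
}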
sizeof(int) * non_zero_num, -#else +#else // MUSA & CUDA sizeof(IntT) * non_zero_num, #endif gpuMemcpyDeviceToDevice, diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index a6c2aba97b5e8..c462ddec7a351 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -83,6 +83,9 @@ int GetAllStringsSize(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP phi::backends::gpu::GpuMemcpyAsync( &num, nums_ptr, sizeof(int), hipMemcpyDeviceToHost, dev_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + phi::backends::gpu::GpuMemcpyAsync( + &num, nums_ptr, sizeof(int), musaMemcpyDeviceToHost, dev_ctx.stream()); #else phi::backends::gpu::GpuMemcpyAsync( &num, nums_ptr, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); @@ -179,6 +182,9 @@ void DeserializeOnGPU(const phi::GPUContext& dev_ctx, #ifdef PADDLE_WITH_HIP phi::backends::gpu::GpuMemcpySync( &numel, strings_data, sizeof(numel), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + phi::backends::gpu::GpuMemcpySync( + &numel, strings_data, sizeof(numel), musaMemcpyDeviceToHost); #else phi::backends::gpu::GpuMemcpySync( &numel, strings_data, sizeof(numel), cudaMemcpyDeviceToHost); diff --git a/paddle/phi/kernels/strings/unicode.cc b/paddle/phi/kernels/strings/unicode.cc index 75e48f1ce982e..e39fcdc0181a6 100644 --- a/paddle/phi/kernels/strings/unicode.cc +++ b/paddle/phi/kernels/strings/unicode.cc @@ -57,6 +57,10 @@ const uint8_t* GetGPUUniflagMap() { hipMalloc(reinterpret_cast(&gpu_uniflag), size); phi::backends::gpu::GpuMemcpySync( gpu_uniflag, cpu_uniflag, size, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&gpu_uniflag), size); + phi::backends::gpu::GpuMemcpySync( + gpu_uniflag, cpu_uniflag, size, musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&gpu_uniflag), size); phi::backends::gpu::GpuMemcpySync( @@ -76,6 +80,10 @@ const uint16_t* GetGPUCharcasesMap() { hipMalloc(reinterpret_cast(&gpu_charcases), size); phi::backends::gpu::GpuMemcpySync( gpu_charcases, cpu_charcases, size, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&gpu_charcases), size); + phi::backends::gpu::GpuMemcpySync( + gpu_charcases, cpu_charcases, size, musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&gpu_charcases), size); phi::backends::gpu::GpuMemcpySync( From c3133ecb9f5c53c0de704f6c1d9a2c7a2ff26f9c Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Wed, 26 Jul 2023 20:07:31 +0800 Subject: [PATCH 09/55] [MTAI] feat(build): fix compiling error for MUSA --- cmake/configure.cmake | 5 + paddle/fluid/platform/device/gpu/gpu_types.h | 9 +- .../platform/device/gpu/musa/musa_helper.h | 0 paddle/fluid/platform/device_context.h | 14 +- paddle/fluid/platform/dynload/mublas.h | 0 paddle/fluid/platform/dynload/mudnn.h | 0 paddle/fluid/platform/dynload/musa_driver.cc | 24 +++ paddle/fluid/platform/dynload/musa_driver.h | 26 +++ paddle/fluid/platform/dynload/musartc.cc | 24 +++ paddle/fluid/platform/dynload/musartc.h | 26 +++ paddle/fluid/platform/dynload/musparse.h | 0 paddle/fluid/platform/enforce.h | 4 +- paddle/phi/api/include/tensor.h | 4 +- paddle/phi/api/lib/tensor_utils.cc | 8 + paddle/phi/backends/device_code.cc | 110 ++--------- paddle/phi/backends/device_code.h | 4 +- paddle/phi/backends/dynload/mublas.h | 6 + paddle/phi/backends/dynload/mudnn.cc | 24 +++ paddle/phi/backends/dynload/mudnn.h | 28 +++ paddle/phi/backends/dynload/murand.h | 0 
paddle/phi/backends/dynload/musa_driver.cc | 24 +++ paddle/phi/backends/dynload/musa_driver.h | 24 +++ paddle/phi/backends/dynload/musartc.cc | 24 +++ paddle/phi/backends/dynload/musartc.h | 24 +++ paddle/phi/backends/dynload/musparse.h | 0 paddle/phi/backends/gpu/forwards.h | 5 + paddle/phi/backends/gpu/gpu_context.cc | 22 +-- paddle/phi/backends/gpu/gpu_decls.h | 34 ++-- paddle/phi/backends/gpu/gpu_dnn.h | 3 +- paddle/phi/backends/gpu/gpu_launch_config.h | 2 + paddle/phi/backends/gpu/gpu_resources.cc | 50 ++--- paddle/phi/backends/gpu/gpu_types.h | 51 ++--- paddle/phi/backends/gpu/musa/musa_helper.h | 0 paddle/phi/backends/gpu/musa/musa_info.cc | 6 +- paddle/phi/backends/musartc.h | 24 +++ paddle/phi/core/enforce.h | 181 +++++++++++++++++- paddle/phi/kernels/CMakeLists.txt | 8 +- paddle/phi/kernels/funcs/CMakeLists.txt | 5 +- paddle/phi/kernels/impl/warpctc_kernel_impl.h | 2 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 2 +- 40 files changed, 599 insertions(+), 208 deletions(-) create mode 100644 paddle/fluid/platform/device/gpu/musa/musa_helper.h create mode 100644 paddle/fluid/platform/dynload/mublas.h create mode 100644 paddle/fluid/platform/dynload/mudnn.h create mode 100644 paddle/fluid/platform/dynload/musa_driver.cc create mode 100644 paddle/fluid/platform/dynload/musa_driver.h create mode 100644 paddle/fluid/platform/dynload/musartc.cc create mode 100644 paddle/fluid/platform/dynload/musartc.h create mode 100644 paddle/fluid/platform/dynload/musparse.h create mode 100644 paddle/phi/backends/dynload/mublas.h create mode 100644 paddle/phi/backends/dynload/mudnn.cc create mode 100644 paddle/phi/backends/dynload/mudnn.h create mode 100644 paddle/phi/backends/dynload/murand.h create mode 100644 paddle/phi/backends/dynload/musa_driver.cc create mode 100644 paddle/phi/backends/dynload/musa_driver.h create mode 100644 paddle/phi/backends/dynload/musartc.cc create mode 100644 paddle/phi/backends/dynload/musartc.h create mode 100644 paddle/phi/backends/dynload/musparse.h create mode 100644 paddle/phi/backends/gpu/musa/musa_helper.h create mode 100644 paddle/phi/backends/musartc.h diff --git a/cmake/configure.cmake b/cmake/configure.cmake index dc661fce388fe..7a9e3ebdd5fde 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -175,6 +175,11 @@ elseif(WITH_ROCM) if(${MIOPEN_VERSION} VERSION_LESS 2090) message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") endif() +elseif(WITH_MUSA) + add_definitions(-DPADDLE_WITH_MUSA) + add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DEIGEN_USE_MUSA) + list(APPEND DEPENDENT_INCLUDE_DIRS "/usr/local/musa/include/") else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index dac2add9f82c1..ba7b1ede735fe 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -25,6 +25,7 @@ #elif defined(PADDLE_WITH_MUSA) #include +#include //TODO(Xiaokang Shang) #else #include @@ -51,11 +52,12 @@ namespace paddle { DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_T); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, 
musaDeviceProp_t); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); // TODO(Xiaokang Shang): confirm mudnn type +#if 0 DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, @@ -90,12 +92,13 @@ DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +#endif DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +//DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); using CUDAGraphID = unsigned long long; // NOLINT diff --git a/paddle/fluid/platform/device/gpu/musa/musa_helper.h b/paddle/fluid/platform/device/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2aa336486308d..8d26ec716504d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -42,6 +42,18 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/fluid/platform/dynload/mudnn.h" +#include "paddle/fluid/platform/dynload/musparse.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" @@ -73,7 +85,7 @@ limitations under the License. */ #include "paddle/phi/backends/stream.h" #if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__) -#include "unsupported/Eigen/CXX11/Tensor" +//#include "unsupported/Eigen/CXX11/Tensor" #endif namespace Eigen { diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/dynload/mudnn.h b/paddle/fluid/platform/dynload/mudnn.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc new file mode 100644 index 0000000000000..2015bbed28cbd --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
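// A sketch of the aliasing idiom behind DECLARE_TYPE_FOR_GPU above: a single
// macro picks the backend-specific type for each generic gpu* name, so the
// rest of the codebase only ever spells the alias. The macro body here is
// illustrative, not the real definition.
#if defined(PADDLE_WITH_HIP)
#define DECLARE_TYPE_FOR_GPU_SKETCH(alias, cuda_t, hip_t, musa_t) using alias = hip_t
#elif defined(PADDLE_WITH_MUSA)
#define DECLARE_TYPE_FOR_GPU_SKETCH(alias, cuda_t, hip_t, musa_t) using alias = musa_t
#else
#define DECLARE_TYPE_FOR_GPU_SKETCH(alias, cuda_t, hip_t, musa_t) using alias = cuda_t
#endif
// e.g. DECLARE_TYPE_FOR_GPU_SKETCH(gpuError_t, cudaError_t, hipError_t, musaError_t);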
*/ + +namespace paddle { +namespace platform { +namespace dynload { + +bool HasCUDADriver() { return false; } + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h new file mode 100644 index 0000000000000..a55f0bd70f967 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasCUDADriver(); + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc new file mode 100644 index 0000000000000..5bc7b6737b3fb --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace paddle { +namespace platform { +namespace dynload { + +bool HasNVRTC() { return false; } + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h new file mode 100644 index 0000000000000..a81254119de57 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasNVRTC(); + +} // namespace dynload +} // namespace platform +} // namespace paddle + diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 160d6fb9912cb..72771dafe62fc 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -42,8 +42,8 @@ limitations under the License. */ #include #include #include -#include -#include +//#include +//#include #include #include #endif // PADDLE_WITH_MUSA diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b2c687a1f448d..ab7c298288d9d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -25,8 +25,8 @@ using gpuStream_t = cudaStream_t; #endif #ifdef PADDLE_WITH_HIP -#include -using gpuStream_t = hipStream_t; +//#include +//using gpuStream_t = hipStream_t; #endif #ifdef PADDLE_WITH_MUSA diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index 3384b59158703..c96cf57f1ce6c 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -20,6 +20,8 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif @@ -43,6 +45,12 @@ phi::Place GetPlaceFromPtr(void* data) { phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " "supported when CUDA version >= 10.0.")); #endif +#elif defined(PADDLE_WITH_MUSA) + musaPointerAttributes attr; + musaError_t status = musaPointerGetAttributes(&attr, data); + if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { + return phi::GPUPlace(attr.device); + } #else hipPointerAttribute_t attr; hipError_t status = hipPointerGetAttributes(&attr, data); diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index 529e42fc4c95b..33b8f3a320aac 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -24,7 +24,9 @@ limitations under the License. 
*/ #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/flags.h" - +#ifdef PADDLE_WITH_MUSA +#include +#endif PHI_DECLARE_string(cuda_dir); namespace phi { @@ -107,7 +109,7 @@ static bool CheckCUDADriverResult(MUresult result, std::string kernel_name = "") { if (result != MUSA_SUCCESS) { const char* error = nullptr; - dynload::muGetErrorString(result, &error); + muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -138,7 +140,7 @@ void GPUDeviceCode::CheckAvailableStatus() { hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); #elif defined(PADDLE_WITH_MUSA) - nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); + #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -150,7 +152,7 @@ void GPUDeviceCode::CheckAvailableStatus() { hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { #elif defined(PADDLE_WITH_MUSA) - MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + MUresult driver_result = muDriverGetVersion(&driver_version); if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); @@ -166,7 +168,7 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { #elif defined(PADDLE_WITH_MUSA) - if (nvrtc_result != NVRTC_SUCCESS || driver_result != MUSA_SUCCESS) { + if (false) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -178,7 +180,7 @@ void GPUDeviceCode::CheckAvailableStatus() { if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { #elif defined(PADDLE_WITH_MUSA) - if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + if (CheckCUDADriverResult(muDeviceGetCount(&count), "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), @@ -340,85 +342,10 @@ bool GPUDeviceCode::Compile(bool include_path) { return false; } #elif defined(PADDLE_WITH_MUSA) - nvrtcProgram program; - if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, - kernel_.c_str(), // buffer - name_.c_str(), // name - 0, // numHeaders - nullptr, // headers - nullptr), // includeNames - "nvrtcCreateProgram")) { - return false; - } - - // Compile the program for specified compute_capability auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); - int compute_capability = dev_ctx->GetComputeCapability(); - std::string compute_flag = - "--gpu-architecture=compute_" + std::to_string(compute_capability); - std::vector options = {"--std=c++11", compute_flag.c_str()}; - std::string include_option; - if (include_path) { - std::string cuda_include_path = FindMUSAIncludePath(); - if (!cuda_include_path.empty()) { - include_option = "--include-path=" + cuda_include_path; - options.push_back(include_option.c_str()); - } - } - nvrtcResult compile_result = - dynload::nvrtcCompileProgram(program, // program - options.size(), // numOptions - options.data()); // options - if (compile_result == NVRTC_ERROR_COMPILATION) { - // Obtain compilation log from the program - size_t log_size; - if (!CheckNVRTCResult(dynload::nvrtcGetProgramLogSize(program, &log_size), - "nvrtcGetProgramLogSize")) { - return false; - } - std::vector log; - log.resize(log_size + 1); - if 
(!CheckNVRTCResult(dynload::nvrtcGetProgramLog(program, log.data()), - "nvrtcGetProgramLog")) { - return false; - } - LOG(WARNING) << "JIT compiling of CUDA code failed:" - << "\n Kernel name: " << name_ << "\n Kernel body:\n" - << kernel_ << "\n Compiling log: " << log.data(); - - return false; - } - - // Obtain PTX from the program - size_t ptx_size; - if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size), - "nvrtcGetPTXSize")) { - return false; - } - ptx_.resize(ptx_size + 1); - if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()), - "nvrtcGetPTX")) { - return false; - } - - if (!CheckNVRTCResult(dynload::nvrtcDestroyProgram(&program), - "nvrtcDestroyProgram")) { - return false; - } - - if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), - "muModuleLoadData", - name_)) { - return false; - } - - if (!CheckCUDADriverResult( - dynload::muModuleGetFunction(&function_, module_, name_.c_str()), - "muModuleGetFunction", - name_)) { - return false; - } + is_compiled_ = false; + return false; #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -539,7 +466,7 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { name_.c_str())); #elif defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_EQ( - dynload::muLaunchKernel(function_, + muLaunchKernel(function_, num_blocks, 1, 1, // grid dim @@ -581,15 +508,10 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } -#elif defined(PADDLE_WITH_MUSA) -bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { - if (result != NVRTC_SUCCESS) { - LOG_FIRST_N(WARNING, 1) - << "Call " << function << " for < " << name_ - << " > failed: " << dynload::nvrtcGetErrorString(result); - return false; - } -#else + return true; +} +#endif +#ifdef PADDLE_WITH_CUDA bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { LOG_FIRST_N(WARNING, 1) @@ -597,9 +519,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } -#endif return true; } #endif +#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 63d221ea8c89a..5721f8f04768e 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -27,8 +27,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/nvrtc.h" #endif #ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musartc.h" #include "paddle/phi/backends/dynload/musa_driver.h" -#include "paddle/phi/backends/dynload/nvrtc.h" #endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" @@ -73,7 +73,7 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); #elif defined(PADDLE_WITH_MUSA) - bool CheckNVRTCResult(cudartcResult result, std::string function); + #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h new file mode 100644 index 0000000000000..bbba96fa497a2 --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.h @@ -0,0 +1,6 @@ + +#include +namespace phi { +namespace dynload { +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc new file mode 100644 index 0000000000000..19ada8408ed17 --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace phi { +namespace dynload { + +bool HasCUDNN() { + return false; +} + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h new file mode 100644 index 0000000000000..c96a2570210d2 --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MUSA + +namespace phi { +namespace dynload { + +extern bool HasCUDNN(); + + +} // namespace dynload +} // namespace phi + +#endif + diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc new file mode 100644 index 0000000000000..009dda42ceebf --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace phi { +namespace dynload { + +bool HasCUDADriver() { + return false; +} + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h new file mode 100644 index 0000000000000..1363d135d5f7e --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace dynload { + +extern bool HasCUDADriver(); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc new file mode 100644 index 0000000000000..cf14ae70a01a1 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +namespace phi { +namespace dynload { + +bool HasNVRTC() { + return false; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h new file mode 100644 index 0000000000000..dc9ebc3faf0d7 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace phi { +namespace dynload { + +extern bool HasNVRTC(); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index e1f3492f76870..7a66475b13aa9 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,6 +72,11 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; +// Forward declaration of MUSA runtime types. +using musaStream_t = struct MUstream_st *; +using musaEvent_t = struct MUevent_st *; +using mublasHandle_t = struct _mublasHandle_t*; + /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index e954c7db337aa..2bcf665b8fd61 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -46,7 +46,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_MUSA #include "paddle/phi/backends/dynload/mublas.h" #include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/musolver.h" #include "paddle/phi/backends/dynload/musparse.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/phi/backends/dynload/mccl.h" @@ -62,10 +61,6 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP -#ifdef PADDLE_WITH_MUSA - -#endif - // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. #include "unsupported/Eigen/CXX11/Tensor" @@ -164,10 +159,10 @@ static void StreamCallbackFunc(gpuStream_t stream, #ifdef PADDLE_WITH_MUSA #if MUSA_VERSION >= 10000 - static void MUDART_CB StreamCallbackFunc(void* user_data) + static void StreamCallbackFunc(void* user_data) #else - static void MUDART_CB - StreamCallbackFunc(musaStream_t stream, musaError_t status, void* user_data) + static void + StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) #endif #endif @@ -497,10 +492,7 @@ struct GPUContext::Impl { dnn_handle_ = nullptr; } #elif defined(PADDLE_WITH_MUSA) - if (owned_ && dnn_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(dnn_handle_)); - dnn_handle_ = nullptr; - } + #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -578,7 +570,7 @@ struct GPUContext::Impl { } #endif // !defined(_WIN32) -#else // PADDLE_WITH_HIP +#else // PADDLE_WITH_MUSA cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) e_sync = cudaStreamSynchronize(stream()); @@ -588,7 +580,7 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#endif // PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } @@ -775,7 +767,7 @@ struct GPUContext::Impl { } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::GpuStreamSync(stream()); #endif { diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 93dba9764478a..d6f42ff743e58 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -20,22 +20,25 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define 
DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; #elif defined(PADDLE_WITH_MUSA) - -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using mudnnHandle_t = bool**; + using mublasLtHandle_t = bool**; + using musparseHandle_t = bool**; + using musolverDnHandle_t = bool**; +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = MUSA_TYPE; -#else // PADDLE_WITH_CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#else +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif - -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +#endif // PADDLE_WITH_CDUA +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +#if 0 DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, miopenActivationDescriptor); @@ -60,18 +63,17 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +#endif // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - -DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasLtHandle_t); -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle, musolverDnHandle_t); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle, musparseHandle_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); #undef DECLARE_TYPE_FOR_GPU using CUDAGraphID = unsigned long long; // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index 30cf3fae80519..dfb13e29dbf89 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -20,8 +20,7 @@ #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" #elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" -#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" + #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 00aa244041bec..675353e011498 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -20,6 +20,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index b60d0cccd3dc5..e13d318942e06 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -37,9 +37,7 @@ #ifdef PADDLE_WITH_MUSA #include "paddle/phi/backends/dynload/mublas.h" 
-#include "paddle/phi/backends/dynload/mublasLt.h" #include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/musolver.h" #include "paddle/phi/backends/dynload/musparse.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) #include "paddle/phi/backends/dynload/mccl.h" @@ -158,7 +156,8 @@ void InitGpuProperties(Place place, "version."; } #elif defined(PADDLE_WITH_MUSA) - size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + //size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + size_t mudnn_dso_ver = 0; LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." << (mudnn_dso_ver % 1000) / 100 << "."; @@ -184,15 +183,15 @@ void InitGpuProperties(Place place, local_musa_version / 10, mudnn_dso_ver / 1000)); #endif - if (local_cuda_version < compile_cuda_version) { + if (local_musa_version < compile_musa_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << static_cast(place.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ". The installed Paddle is compiled with MUSA " + << compile_musa_version / 10 << "." << compile_musa_version % 10 + << ", but MUSA runtime version in your machine is " + << local_musa_version / 10 << "." << local_musa_version % 10 << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDA " + << "Please recompile or reinstall Paddle with compatible MUSA " "version."; } #else @@ -267,9 +266,9 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); #elif defined(PADDLE_WITH_MUSA) - PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mublasCreate(blas_handle)); - PADDLE_RETRY_MUSA_SUCCESS( - phi::dynload::mublasSetStream(*blas_handle, stream)); + PADDLE_RETRY_CUDA_SUCCESS(mublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + mublasSetStream(*blas_handle, stream)); #else // PADDLE_WITH_MUSA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( @@ -285,7 +284,7 @@ void DestroyBlasHandle(blasHandle_t handle) { } #elif defined(PADDLE_WITH_MUSA) if (handle != nullptr) { - phi::dynload::mublasDestroy(handle); + mublasDestroy(handle); handle = nullptr; } #else @@ -334,21 +333,7 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); #elif defined(PADDLE_WITH_MUSA) - auto local_cudnn_version = phi::dynload::mudnnGetVersion() / 100; - auto compile_mudnn_version = MUDNN_VERSION / 100; - if (local_mudnn_version < static_cast(compile_mudnn_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place.device - << ". The installed Paddle is compiled with MUDNN " - << compile_mudnn_version / 10 << "." << compile_mudnn_version % 10 - << ", but MUDNN version in your machine is " - << local_mudnn_version / 10 << "." << local_mudnn_version % 10 - << ", which may cause serious incompatible bug. 
" - << "Please recompile or reinstall Paddle with compatible MUDNN " - "version."; - } - PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnCreate(handle)); - PADDLE_RETRY_MUSA_SUCCESS(phi::dynload::mudnnSetStream(*handle, stream)); + #else auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; @@ -378,10 +363,7 @@ void DestroyDnnHandle(dnnHandle_t handle) { handle = nullptr; } #elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(handle)); - handle = nullptr; - } + #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -391,14 +373,14 @@ void DestroyDnnHandle(dnnHandle_t handle) { } void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); #endif } void DestroySolverHandle(solverHandle_t solver_handle) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA if (solver_handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); solver_handle = nullptr; diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 36e094f4a0814..21230d6b22701 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -25,6 +25,7 @@ #elif defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/dynload/mublas.h" #include "paddle/phi/backends/dynload/mudnn.h" +#include #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -33,57 +34,61 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; #elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = MUSA_TYPE; #else // PADDLE_WITH_CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); -DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); -DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); -DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, - cudnnTensorFormat_t, - miopenTensorFormat_t); -DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, - cudnnActivationMode_t, - miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); +//DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +//DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +//DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, +// cudnnTensorFormat_t, +// miopenTensorFormat_t); +//DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, +// cudnnActivationMode_t, +// miopenActivationMode_t); #undef DECLARE_TYPE_FOR_GPU #ifdef 
PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; #elif defined(PADDLE_WITH_MUSA) -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice, + musaMemcpyKind::musaMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost, + musaMemcpyKind::musaMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice, + musaMemcpyKind::musaMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc index f2087e4d7f4fc..ced106d6c6b3d 100644 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -13,12 +13,14 @@ // limitations under the License. #include +#include #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" #include "paddle/phi/core/enforce.h" -#include "musa_runtime_api.h" +#include "musa_runtime.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; @@ -29,7 +31,7 @@ namespace backends { namespace gpu { int DnnVersion() { - return 0.0.0; + return 0; //if (!dynload::HasCUDNN()) return -1; //size_t version_major, version_minor, version_patch; //PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( diff --git a/paddle/phi/backends/musartc.h b/paddle/phi/backends/musartc.h new file mode 100644 index 0000000000000..dc9ebc3faf0d7 --- /dev/null +++ b/paddle/phi/backends/musartc.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace dynload { + +extern bool HasNVRTC(); + +} // namespace dynload +} // namespace phi + diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index cda5a3a49c528..aaa3eebfe27a5 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -36,11 +36,11 @@ limitations under the License. */ #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_MUSA -#include +#include #include #include -#include -#include +//#include +//#include #include #include #endif // PADDLE_WITH_MUSA @@ -90,7 +90,6 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/mublas.h" #include "paddle/phi/backends/dynload/mudnn.h" #include "paddle/phi/backends/dynload/murand.h" -#include "paddle/phi/backends/dynload/musolver.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include @@ -854,6 +853,180 @@ inline void retry_sleep(unsigned milliseconds) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA +/************************************************************************/ +/**************************** MUSA ERROR ********************************/ +#ifdef PADDLE_WITH_MUSA + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); +//DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mublasStatus_t, MUBLAS_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); +//DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); +#endif + +} // namespace details + +/*************** MUSA ERROR ***************/ +inline bool is_error(musaError_t e) { return e != musaSuccess; } + +inline std::string build_musa_error_msg(musaError_t e) { + std::ostringstream sout; + sout << "MUSA error(" << e << "), " << musaGetErrorString(e) << ". "; + return sout.str(); +} + +///*************** MURAND ERROR ***************/ +//inline bool is_error(murandStatus_t stat) { +// return stat != MURAND_STATUS_SUCCESS; +//} +// +//inline std::string build_musa_error_msg(murandStatus_t stat) { +// std::ostringstream sout; +// sout << "MURAND error(" << stat << "). " << GetExternalErrorMsg(stat); +// return sout.str(); +//} + +/*************** MUBLAS ERROR ***************/ +inline bool is_error(mublasStatus_t stat) { + return stat != MUBLAS_STATUS_SUCCESS; +} + +inline std::string build_musa_error_msg(mublasStatus_t stat) { + std::ostringstream sout; + sout << "MUBLAS error(" << stat << "). "; + return sout.str(); +} + +///*************** CUSPARSE ERROR ***************/ +//inline bool is_error(cusparseStatus_t stat) { +// return stat != CUSPARSE_STATUS_SUCCESS; +//} +// +//inline std::string build_musa_error_msg(cusparseStatus_t stat) { +// std::ostringstream sout; +// sout << "CUSparse error(" << stat << "). 
" << GetExternalErrorMsg(stat); +// return sout.str(); +//} + +/**************** MCCL ERROR ****************/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +inline bool is_error(mcclResult_t mccl_result) { + return mccl_result != mcclSuccess; +} + +inline std::string build_musa_error_msg(mcclResult_t mccl_result) { + std::ostringstream sout; + sout << "MCCL error(" << mccl_result << "), " + << phi::dynload::mcclGetErrorString(mccl_result) << ". "; + if (errno == ENOSPC || errno == EAGAIN) { + std::string detail(strerror(errno)); + detail += "\nPlease try one of the following solutions:"; + detail += "\n1. export MCCL_SHM_DISABLE=1;"; + detail += "\n2. export MCCL_P2P_LEVEL=SYS;"; + detail += + "\n3. Increase shared memory by setting the -shm-size " + "option when starting docker container, e.g., setting " + " -shm-size=2g.\n"; + sout << " Detail: " + detail; + } + return sout.str(); +} +#endif // not(__APPLE__) and PADDLE_WITH_MCCL + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_WARN_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + ::phi::enforce::ThrowWarnInternal( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = musaGetLastError(); \ + if (UNLIKELY(res != musaSuccess)) { \ + auto msg = ::phi::enforce::build_musa_error_msg(res); \ + PADDLE_THROW( \ + phi::errors::Fatal("MUSA error after kernel (%s): %s", OP, msg)); \ + } \ + } while (0) + +inline void retry_sleep(unsigned milliseconds) { +#ifdef _WIN32 + Sleep(milliseconds); +#else + if (milliseconds < 1000) { + // usleep argument must be less than 1,000,000. 
Reference: + // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html + usleep(milliseconds * 1000); + } else { + // clip to sleep in seconds because we can not and don't have to + // sleep for exact milliseconds + sleep(milliseconds / 1000); + } +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_MUSA + /**************************************************************************/ /***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index e28210cfca7e4..4b21e61f8d88c 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -117,10 +117,10 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) - collect_srcs(kernels_srcs SRCS ${kernel_cu}) - kernel_declare("${kernel_cu}") -endif() +#if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +# collect_srcs(kernels_srcs SRCS ${kernel_cu}) +# kernel_declare("${kernel_cu}") +#endif() if(WITH_XPU) if(WITH_XPU_KP) diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index c4bdf29e03949..f90147b013023 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -1,4 +1,4 @@ -add_subdirectory(eigen) +#add_subdirectory(eigen) add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) @@ -15,4 +15,5 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) "*.cu") endif() -collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) +#collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) +collect_srcs(kernels_srcs SRCS ${func_cc_srcs}) diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index 015c7a0764a2b..4b4bd6f5143dd 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -205,7 +205,7 @@ class WarpCTCFunctor { warpctc_version_ = phi::dynload::get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); diff --git a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h index f36ec9c007eda..bc12e17ae55fb 100644 --- a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h +++ b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h @@ -208,7 +208,7 @@ class WarpRNNTFunctor { options_.batch_first = true; if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); From 50b52384069e0e1f202150bfbea79c7a0251d8a9 Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Sat, 29 Jul 2023 17:03:52 +0800 Subject: [PATCH 10/55] [MTAI] feat(build): fix building error for musa backend --- cmake/configure.cmake | 1 + cmake/generic.cmake | 17 +- cmake/musa.cmake | 3 +- paddle/fluid/framework/data_type.h | 1 - paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 62 +++- paddle/fluid/framework/ir/CMakeLists.txt | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 2 +- .../conv_affine_channel_mkldnn_fuse_pass.cc | 2 + .../framework/ir/mkldnn/cpu_quantize_pass.cc | 3 +- paddle/fluid/framework/var_type_traits.cc | 8 + .../fluid/imperative/gradient_accumulator.cc | 18 +- .../inference/api/.resource_manager.h.swp | Bin 0 -> 16384 bytes .../allocation/cuda_managed_allocator.h | 1 + .../memory/allocation/pinned_allocator.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 15 + .../allocation/stream_safe_cuda_allocator.h | 3 + .../memory/allocation/system_allocator.cc | 13 +- paddle/fluid/operators/affine_channel_op.cu | 4 + paddle/fluid/operators/batch_norm_op.cu | 3 + .../fluid/operators/detection/bbox_util.cu.h | 3 + .../detection/collect_fpn_proposals_op.cu | 3 + .../elementwise/elementwise_op_function.h | 18 +- paddle/fluid/operators/expand_as_op.cc | 2 +- paddle/fluid/operators/expand_op.cc | 2 +- .../fluid/operators/fused/attn_bias_add.cu.h | 3 + .../fused_fc_elementwise_layernorm_op.cu | 3 + .../fluid/operators/fused_token_prune_op.cu | 3 + .../get_tensor_from_selected_rows_op.cc | 2 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/math/inclusive_scan.h | 3 + paddle/fluid/operators/matmul_op.cc | 14 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/mudnn_rnn_cache.h | 33 ++ paddle/fluid/operators/nop_op.cc | 2 +- .../operators/optimizers/sparse_momentum_op.h | 5 +- .../fluid/operators/pad_constant_like_op.cc | 2 +- paddle/fluid/operators/prroi_pool_op.h | 8 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 2 +- paddle/fluid/operators/save_op.cc | 2 +- .../sequence_ops/sequence_softmax_op.cu | 4 + paddle/fluid/operators/svd_helper.h | 2 +- .../fluid/operators/sync_batch_norm_utils.h | 3 + paddle/fluid/operators/top_k_op.cu | 3 + paddle/fluid/operators/uniform_random_op.h | 4 +- paddle/fluid/platform/CMakeLists.txt | 30 +- paddle/fluid/platform/device/CMakeLists.txt | 2 +- .../fluid/platform/device/gpu/CMakeLists.txt | 6 + paddle/fluid/platform/device/gpu/gpu_dnn.h | 2 +- .../platform/device/gpu/gpu_resource_pool.cc | 2 +- paddle/fluid/platform/device/gpu/gpu_types.h | 5 +- paddle/fluid/platform/dynload/CMakeLists.txt | 12 + paddle/fluid/platform/profiler.cc | 2 +- .../fluid/platform/stream_callback_manager.cc | 15 +- paddle/phi/CMakeLists.txt | 9 + paddle/phi/backends/device_code.cc | 140 ++++---- paddle/phi/backends/dynload/CMakeLists.txt | 12 + paddle/phi/backends/gpu/gpu_decls.h | 1 + paddle/phi/backends/gpu/gpu_primitives.h | 308 +++++++++--------- .../phi/backends/gpu/musa/.musa_info.cc.swp | Bin 0 -> 4096 bytes .../backends/gpu/musa/musa_device_function.h | 190 +++++++++++ paddle/phi/backends/gpu/musa/musa_helper.h | 34 ++ paddle/phi/common/.float16.h.swp | Bin 0 
-> 16384 bytes paddle/phi/common/bfloat16.h | 19 +- paddle/phi/common/complex.h | 36 +- paddle/phi/common/cpstring_impl.h | 6 +- paddle/phi/common/float16.h | 31 +- paddle/phi/common/scalar.h | 16 +- paddle/phi/common/transform.h | 17 +- paddle/phi/core/enforce.h | 11 + paddle/phi/core/hostdevice.h | 6 +- paddle/phi/core/macros.h | 2 +- paddle/phi/core/visit_type.h | 8 - paddle/phi/kernels/CMakeLists.txt | 24 +- paddle/phi/kernels/activation_kernel.cc | 2 +- paddle/phi/kernels/assign_kernel.cc | 2 +- paddle/phi/kernels/batch_norm_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 +- .../phi/kernels/cpu/activation_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/activation_kernel.cc | 32 +- paddle/phi/kernels/cpu/cast_grad_kernel.cc | 6 +- paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/.im2col.cu.swp | Bin 0 -> 16384 bytes paddle/phi/kernels/funcs/CMakeLists.txt | 2 +- paddle/phi/kernels/funcs/activation_functor.h | 2 +- paddle/phi/kernels/funcs/algorithm.h | 4 +- paddle/phi/kernels/funcs/broadcast_function.h | 4 +- .../kernels/funcs/concat_and_split_functor.cu | 4 + paddle/phi/kernels/funcs/cross_entropy.cu | 4 +- paddle/phi/kernels/funcs/diagonal.h | 6 +- .../phi/kernels/funcs/distribution_helper.h | 72 +++- .../phi/kernels/funcs/eigen/.extensions.h.swp | Bin 0 -> 16384 bytes paddle/phi/kernels/funcs/eigen/.slice.cu.swp | Bin 0 -> 12288 bytes paddle/phi/kernels/funcs/eigen/erf.cc | 6 +- paddle/phi/kernels/funcs/eigen/extensions.h | 3 +- paddle/phi/kernels/funcs/eigen/pad.cu | 4 +- paddle/phi/kernels/funcs/eigen/slice.cu | 4 +- paddle/phi/kernels/funcs/elementwise_base.h | 6 +- .../phi/kernels/funcs/elementwise_functor.h | 2 +- .../phi/kernels/funcs/elementwise_grad_base.h | 4 +- paddle/phi/kernels/funcs/fft.cu | 3 +- paddle/phi/kernels/funcs/fft_fill_conj.h | 4 +- paddle/phi/kernels/funcs/for_range.h | 2 +- .../kernels/funcs/gather_scatter_functor.cu | 2 +- .../kernels/funcs/gather_scatter_functor.h | 2 - paddle/phi/kernels/funcs/im2col.cu | 14 +- paddle/phi/kernels/funcs/inclusive_scan.h | 3 + paddle/phi/kernels/funcs/index_calculator.h | 2 +- paddle/phi/kernels/funcs/index_put_utils.h | 7 +- .../phi/kernels/funcs/interpolate_function.h | 4 +- paddle/phi/kernels/funcs/isfinite_functor.h | 6 +- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 3 + paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/mode.h | 4 +- paddle/phi/kernels/funcs/mufft_util.h | 0 paddle/phi/kernels/funcs/norm_utils.cu.h | 3 + paddle/phi/kernels/funcs/random.cuh | 3 + paddle/phi/kernels/funcs/reduce_function.h | 8 +- paddle/phi/kernels/funcs/segment_pooling.cu | 19 +- paddle/phi/kernels/funcs/select_impl.cu.h | 3 + paddle/phi/kernels/funcs/softmax.cu | 8 +- paddle/phi/kernels/funcs/squared_l2_norm.h | 6 +- .../phi/kernels/funcs/top_k_function_cuda.h | 3 + .../gpu/fused_bn_activation_grad_kernel.cu | 4 + .../fusion/gpu/fused_bn_activation_kernel.cu | 4 + paddle/phi/kernels/gpu/.auc_kernel.cu.swp | Bin 0 -> 4096 bytes paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 5 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/argsort_kernel.cu | 3 + paddle/phi/kernels/gpu/auc_kernel.cu | 17 + .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 18 + paddle/phi/kernels/gpu/batch_norm_kernel.cu | 241 +++++++------- 
paddle/phi/kernels/gpu/bernoulli_kernel.cu | 3 + .../phi/kernels/gpu/check_numerics_kernel.cu | 6 + paddle/phi/kernels/gpu/cholesky_kernel.cu | 4 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 4 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 13 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 10 + .../phi/kernels/gpu/cudnn_lstm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu | 3 +- paddle/phi/kernels/gpu/cum_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/cum_kernel.cu | 5 + paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/decode_jpeg_kernel.cu | 2 +- paddle/phi/kernels/gpu/depthwise_conv.h | 3 + .../gpu/distribute_fpn_proposals_kernel.cu | 3 + .../kernels/gpu/generate_proposals_kernel.cu | 3 + .../phi/kernels/gpu/graph_reindex_kernel.cu | 3 + paddle/phi/kernels/gpu/group_norm_utils.h | 3 + .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 5 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 17 +- .../phi/kernels/gpu/instance_norm_kernel.cu | 38 +-- paddle/phi/kernels/gpu/instance_norm_utils.h | 3 + paddle/phi/kernels/gpu/mudnn_lstm_cache.h | 0 paddle/phi/kernels/gpu/multinomial_kernel.cu | 3 + paddle/phi/kernels/gpu/nonzero_kernel.cu | 3 + paddle/phi/kernels/gpu/norm_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/norm_kernel.cu | 3 + paddle/phi/kernels/gpu/poisson_kernel.cu | 3 + paddle/phi/kernels/gpu/randperm_kernel.cu | 9 + .../gpu/sigmoid_cross_entropy_with_logits.h | 3 + .../phi/kernels/gpu/viterbi_decode_kernel.cu | 3 + .../phi/kernels/impl/clip_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/clip_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 2 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/dot_grad_kernel_impl.h | 12 +- .../impl/elementwise_grad_kernel_impl.h | 2 +- .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/isclose_kernel_impl.h | 2 +- .../phi/kernels/impl/kron_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/kron_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/polygamma_kernel_impl.h | 4 +- paddle/phi/kernels/impl/renorm_impl.h | 6 +- .../impl/repeat_interleave_grad_kernel_impl.h | 10 +- .../impl/repeat_interleave_kernel_impl.h | 8 +- .../kernels/impl/sequence_mask_kernel_impl.h | 4 +- .../phi/kernels/impl/solve_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 4 +- .../kernels/impl/unstack_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/unstack_kernel_impl.h | 6 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 2 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- .../kernels/primitive/compute_primitives.h | 3 + paddle/phi/kernels/prod_kernel.cc | 2 +- paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_min_kernel.cc | 4 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- .../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 2 +- .../kernels/selected_rows/isfinite_kernel.cc | 2 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- 
.../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/shape_kernel.cc | 2 +- .../kernels/sparse/batch_norm_grad_kernel.cc | 2 +- .../phi/kernels/sparse/batch_norm_kernel.cc | 2 +- paddle/phi/kernels/sparse/empty_kernel.cc | 2 +- paddle/phi/kernels/sparse/gpu/conv.cu.h | 3 + .../sparse/sparse_utils_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/case_utils.h | 2 +- .../strings/gpu/strings_lower_upper_kernel.cu | 3 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- .../fluid/test_leaky_relu_grad_grad_functor.h | 4 +- .../test_strings_lower_upper_dev_api.cu | 2 +- test/custom_op/custom_raw_op_kernel_op.h | 2 +- 233 files changed, 1492 insertions(+), 709 deletions(-) create mode 100644 paddle/fluid/inference/api/.resource_manager.h.swp create mode 100644 paddle/fluid/operators/mudnn_rnn_cache.h create mode 100644 paddle/phi/backends/gpu/musa/.musa_info.cc.swp create mode 100644 paddle/phi/backends/gpu/musa/musa_device_function.h create mode 100644 paddle/phi/common/.float16.h.swp create mode 100644 paddle/phi/kernels/funcs/.im2col.cu.swp create mode 100644 paddle/phi/kernels/funcs/eigen/.extensions.h.swp create mode 100644 paddle/phi/kernels/funcs/eigen/.slice.cu.swp create mode 100644 paddle/phi/kernels/funcs/mufft_util.h create mode 100644 paddle/phi/kernels/gpu/.auc_kernel.cu.swp create mode 100644 paddle/phi/kernels/gpu/mudnn_lstm_cache.h diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 7a9e3ebdd5fde..c1db56de7f728 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -177,6 +177,7 @@ elseif(WITH_ROCM) endif() elseif(WITH_MUSA) add_definitions(-DPADDLE_WITH_MUSA) + #add_definitions(-DEIGEN_USE_THREADS) add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_MUSA) list(APPEND DEPENDENT_INCLUDE_DIRS "/usr/local/musa/include/") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 28aecb580a637..4a255c0902206 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -787,14 +787,13 @@ function(musa_library TARGET_NAME) "${multiValueArgs}" ${ARGN}) if(musa_library_SRCS) # TODO(MTAI): enable compiling static library - #if(musa_library_SHARED OR musa_library_shared) # build *.so - # musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) - #else() - # musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) - # find_fluid_modules(${TARGET_NAME}) - # find_phi_modules(${TARGET_NAME}) - #endif() - musa_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) + if(musa_library_SHARED OR musa_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) + endif() if(musa_library_DEPS) add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) @@ -830,7 +829,7 @@ function(musa_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - musa_add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + add_executable(${TARGET_NAME} ${musa_binary_SRCS}) if(musa_binary_DEPS) target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) 
add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) diff --git a/cmake/musa.cmake b/cmake/musa.cmake index 39245d726d4f9..c6701f33858f8 100644 --- a/cmake/musa.cmake +++ b/cmake/musa.cmake @@ -26,7 +26,8 @@ else() list(APPEND MUSA_MCC_FLAGS -std=c++17) endif() -set(MUSA_VERBOSE_BUILD ON) +list(APPEND MUSA_MCC_FLAGS -U__CUDA__) +#set(MUSA_VERBOSE_BUILD ON) if(CMAKE_BUILD_TYPE MATCHES Debug) list(APPEND MUSA_MCC_FLAGS -g2) list(APPEND MUSA_MCC_FLAGS -O0) diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 7e002c8154147..672bac7d329ff 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -130,7 +130,6 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { #define VisitDataTypeCallback(cpp_type, proto_type) \ do { \ if (type == proto_type) { \ - visitor.template apply(); \ return; \ } \ } while (0) diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9d114fcf56396..b2fb089f53574 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4d9a88cf22372..88c58c24b804f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -202,6 +202,66 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) +elseif(WITH_MUSA) + musa_library( + nan_inf_utils + SRCS nan_inf_utils_detail.cc + DEPS framework_proto scope place phi) + musa_library( + all_reduce_op_handle + SRCS all_reduce_op_handle.cc + DEPS op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor) + musa_library( + fused_all_reduce_op_handle + SRCS fused_all_reduce_op_handle.cc + DEPS all_reduce_op_handle + op_handle_base + variable_visitor + scope + lod_tensor + phi + memory + dynload_cuda + place) + musa_library( + grad_merge_all_reduce_op_handle + SRCS grad_merge_all_reduce_op_handle.cc + DEPS fused_all_reduce_op_handle + op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor + place + all_reduce_op_handle) + + if(WITH_DISTRIBUTE) + musa_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope phi dynload_cuda) + else() + musa_library( + reduce_op_handle + SRCS reduce_op_handle.cc + DEPS op_handle_base variable_visitor scope phi dynload_cuda) + endif() + musa_library( + broadcast_op_handle + SRCS broadcast_op_handle.cc + DEPS op_handle_base scope phi memory variable_visitor dynload_cuda) + musa_library( + fused_broadcast_op_handle + SRCS fused_broadcast_op_handle.cc + DEPS broadcast_op_handle) else() cc_library( nan_inf_utils @@ -386,7 +446,7 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 1c186373cdbb5..6f1075c3bf16d 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ 
b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) add_subdirectory(fusion_group) endif() @@ -159,7 +159,7 @@ if(WITH_TENSORRT) pass_library(split_layernorm_to_math_ops_pass inference) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 322fcb0f7cf48..87cc35b2c3b5f 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -35,7 +35,7 @@ namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc index 9639d3f374bef..8180c6c02f651 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -54,6 +54,7 @@ void recompute_bias_and_weights(const Scope* scope, const ir::Node& ac_scale, const phi::DenseTensor& ac_bias_tensor, phi::DenseTensor* eltwise_y_in_tensor) { +#if 0 using EigenVectorArrayMap = Eigen::Map>; using ConstEigenVectorArrayMap = @@ -102,6 +103,7 @@ void recompute_bias_and_weights(const Scope* scope, for (int i = 0; i < weights->numel(); ++i) { if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; } +#endif } ConvAffineChannelFusePass::ConvAffineChannelFusePass() { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 7376e87155187..010a8aabdf1eb 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- +#if 0 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" #include @@ -1328,3 +1328,4 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass) .RequirePassAttr("quant_var_scales"); +#endif diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index d73c9b7d95957..132da0d177178 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,6 +37,14 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#if defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT +#endif +#include "paddle/fluid/operators/mudnn_rnn_cache.h" +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 8c78f7af783dd..8cc764be9ff39 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -204,7 +204,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -313,7 +313,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -321,7 +321,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif @@ -364,7 +364,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -372,7 +372,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif @@ -425,7 +425,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -441,7 +441,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif @@ -712,7 +712,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -778,7 +778,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) } #endif tmp_grad_vars_.clear(); #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 4737e5c565b45..567ec4e4c9461 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -24,7 +24,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -40,7 +40,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #elif
defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaHostMalloc(&ptr, size, musaHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 9f513448eea26..ae9738ee2afd8 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -86,6 +86,16 @@ bool StreamSafeCUDAAllocation::CanBeFreed() { } PADDLE_ENFORCE_GPU_SUCCESS(err); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event); + if (err == musaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = hipEventQuery(event); if (err == hipErrorNotReady) { @@ -122,6 +132,9 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -136,6 +149,8 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 08ecdd4969730..0ab0e932cc6f9 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" @@ -24,6 +25,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index d67df333cfaba..2a26ff170ffdf 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -217,7 +217,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { #ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); #elif defined(PADDLE_WITH_MUSA) - musaError_t result = musaHostMalloc(&p, size, musaHostMallocPortable); + musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -261,9 +261,18 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } +#elif defined(PADDLE_WITH_MUSA) + err = musaFreeHost(p); + if (err 
!= musaErrorMusartUnloading) { + PADDLE_ENFORCE_EQ( + err, + 0, + platform::errors::Fatal( + "musaFreeHost failed in GPUPinnedAllocator, error code is %d", + err)); + } #else err = cudaFreeHost(p); - // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFreeHost after the // driver has already shutdown. This happens only if the diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 6ec8d77da2c85..62c270bfd0311 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 012edde57294a..1272a83b2b147 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -19,6 +19,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index adb60a8a8d064..c5ea2218d996e 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -19,6 +19,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b2bbd9c82095c..eba1c5127b8a9 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -12,6 +12,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index c69acb89750c9..1feb5a5e1fc71 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,9 +32,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #ifdef __NVCC__ #include +#elif defined(__MUSACC__) +#include #elif defined(__HIPCC__) #include #endif @@ -311,7 +313,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index f0d31269da193..f1ebaa147494b 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -155,7 +155,7 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL(expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 490c6f9f6dbfc..54af38ee3d429 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 53001b2493084..b8d66efffee0a 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dee676a7640f4..4eea6ab366fb6 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -15,6 +15,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu index 8f0a53611f3b2..4ff5fd33df3d6 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cu +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -14,6 +14,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index c6a8a4fe7b982..471428b0b44ee 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index ea38db87e63e7..329e7aa0f0607 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index e1e9ca5ef6667..860aee6d9e426 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 940b3eaac0c10..42f79646a670a 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if 
(platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 2c6d72f109c13..1cab5b2551b80 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 197aaa74bb3e1..8a8a705b629bf 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 3032b78a2029d..2718a0079ed19 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -17,6 +17,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 5208d0b2cf937..5394c755e56df 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -75,7 +75,7 @@ class MatMulKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -89,7 +89,7 @@ class MatMulKernel : public framework::OpKernel { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -241,7 +241,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) if 
(context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -373,7 +373,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -615,7 +615,7 @@ class MatMulOp : public framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -758,7 +758,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif @@ -926,7 +926,7 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 27a38571e1c80..6d4960b22411b 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/mudnn_rnn_cache.h b/paddle/fluid/operators/mudnn_rnn_cache.h new file mode 100644 index 0000000000000..af9ebd800fa3c --- /dev/null +++ b/paddle/fluid/operators/mudnn_rnn_cache.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + } + ~CudnnRNNCache() {} +}; + +} // namespace operators +} // namespace paddle + diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index e99b3956d05b0..45d44e71b5775 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -60,6 +60,6 @@ REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} #endif diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1b162be46610..1f3ae2f9e318e 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -28,6 +28,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; @@ -460,7 +463,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 72061fbc39630..ea090c6cdb40a 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -260,7 +260,7 @@ PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL(pad_constant_like, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index e2417a071ce88..a10f59f8a2fbe 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/backends/gpu/gpu_primitives.h" #endif @@ -85,7 +85,7 @@ inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, return sum_out; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff, @@ -163,7 +163,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void AccumulateRois(T* offset, T data) { phi::CudaAtomicAdd(offset, data); @@ -175,7 +175,7 @@ inline HOSTDEVICE void AccumulateRois(T* offset, T data) { } #endif -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE T MaxFunctor(const T x, const T y) { return max(x, y); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index b9f05d663dba0..4d24896d37000 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a089ad7d58fac..1cd5ef11909a0 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -761,7 +761,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index ab03d46486c2e..d5727d9eb9936 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -117,7 +117,7 @@ PD_REGISTER_KERNEL(save_sr, phi::dtype::float16, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 897ff207f5eca..7411ecc05358c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index ccf5cd09a0842..e0004f197cd55 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 7c14f6dfac324..ebc825b66a5ef 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -22,6 +22,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index f1674bc5005a0..fede7fe5156d0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -18,6 +18,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include #endif diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 16bce515f2a7f..12725c397faf6 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 4d7f496aaa42d..527e7396fa488 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,7 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,8 +90,15 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) endif() +if(WITH_MUSA) + musa_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -137,7 +144,7 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) target_link_libraries(device_context gpu_resource_pool) endif() @@ -235,6 +242,16 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() +if(WITH_MUSA) + musa_library( + 
device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") +endif() + cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -339,6 +356,13 @@ if(WITH_GPU) DEPS gpu_info) endif() +if(WITH_MUSA) + musa_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() + if(WITH_ROCM) hip_test( float16_gpu_test diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 6f0d86f0a4b17..b782a45047117 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,7 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 897f8d3732b73..f992901a46fd5 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -22,6 +22,12 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +elseif(WITH_MUSA) + musa_library( + gpu_info + SRCS gpu_info.cc + DEPS phi glog enforce monitor dynload_cuda) + endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h index 2a9db61f83bc6..f6f6392c4c23d 100644 --- a/paddle/fluid/platform/device/gpu/gpu_dnn.h +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -16,7 +16,7 @@ #include "paddle/phi/backends/gpu/gpu_dnn.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index d8e9197bf6ea5..0fb7e061e3243 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -102,7 +102,7 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index ba7b1ede735fe..060a9161c46ad 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -26,6 +26,8 @@ #elif defined(PADDLE_WITH_MUSA) #include #include +#include +using mudnnHandle_t = ::musa::dnn::Handle*; //TODO(Xiaokang Shang) #else #include @@ -91,9 +93,10 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); #endif +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); // TODO(MTAI) +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle, mublasHandle_t); // TODO(Ming Huang): Since there is no blasLt handler, diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt 
index 4cb3bfdb3adae..beac4eb9261a0 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -39,6 +39,9 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musa_driver.cc musartc.cc) + endif() endif() if(TENSORRT_FOUND) @@ -62,6 +65,15 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi) +elseif(WITH_MUSA) + musa_library( + dynload_cuda + SRCS ${MUSA_SRCS} + DEPS dynamic_loader phi) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index d1b557922af32..c23abcee9d725 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -698,7 +698,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index c55bcb71a7d43..b5f593193bfc2 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,6 +24,11 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif +#ifdef PADDLE_WITH_MUSA +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void *user_data) +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void *user_data) @@ -58,6 +63,11 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); + //musaLaunchHostFunc(stream_, StreamCallbackFunc, func)); +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -71,7 +81,7 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_MUSA) platform::GpuStreamSync(stream_); #endif { @@ -85,6 +95,9 @@ void StreamCallbackManager::Wait() const { #ifdef PADDLE_WITH_CUDA template struct StreamCallbackManager; #endif +#ifdef PADDLE_WITH_MUSA +template struct StreamCallbackManager; +#endif #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 593109d3e8e27..139642f5b6b65 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -51,6 +51,15 @@ if(WITH_GPU) list(APPEND PHI_DEPS external_error_proto) endif() +if(WITH_MUSA) + set(DEPENDENT_LIBRARIES "") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmudnn.so") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmccl.so") + list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmusart.so") + list(APPEND DEPENDENT_LIBRARIES 
"/usr/local/musa/lib/libmublas.so") + list(APPEND PHI_DEPS ${DEPENDENT_LIBRARIES}) +endif() + if(WITH_ASCEND_CL) list(APPEND PHI_DEPS npu_hccl) endif() diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index 33b8f3a320aac..97279e2d0f76c 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -80,7 +80,7 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -90,40 +90,40 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) GPUDeviceCode::CheckAvailableStatus(); #endif } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#ifdef PADDLE_WITH_HIP -static bool CheckCUDADriverResult(hipError_t result, - std::string caller, - std::string kernel_name = "") { - if (result != hipSuccess) { - const char* error = nullptr; - error = dynload::hipGetErrorString(result); -#elif defined(PADDLE_WITH_MUSA) -static bool CheckCUDADriverResult(MUresult result, - std::string caller, - std::string kernel_name = "") { - if (result != MUSA_SUCCESS) { - const char* error = nullptr; - muGetErrorString(result, &error); -#else -static bool CheckCUDADriverResult(CUresult result, - std::string caller, - std::string kernel_name = "") { - if (result != CUDA_SUCCESS) { - const char* error = nullptr; - dynload::cuGetErrorString(result, &error); -#endif - LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name - << " > failed: " << error << " (" << result << ")"; - return false; - } - return true; -} +//#ifdef PADDLE_WITH_HIP +//static bool CheckCUDADriverResult(hipError_t result, +// std::string caller, +// std::string kernel_name = "") { +// if (result != hipSuccess) { +// const char* error = nullptr; +// error = dynload::hipGetErrorString(result); +//#elif defined(PADDLE_WITH_MUSA) +////static bool CheckCUDADriverResult(MUresult result, +//// std::string caller, +//// std::string kernel_name = "") { +//// if (result != MUSA_SUCCESS) { +//// const char* error = nullptr; +//// muGetErrorString(result, &error); +//#else +//static bool CheckCUDADriverResult(CUresult result, +// std::string caller, +// std::string kernel_name = "") { +// if (result != CUDA_SUCCESS) { +// const char* error = nullptr; +// dynload::cuGetErrorString(result, &error); +//#endif +// LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name +// << " > failed: " << error << " (" << result << ")"; +// return false; +// } +// return true; +//} bool GPUDeviceCode::available_ = false; void GPUDeviceCode::CheckAvailableStatus() { @@ -148,19 +148,19 @@ void GPUDeviceCode::CheckAvailableStatus() { int driver_version = 0; int dirver_major = 0; int driver_minor = 0; -#ifdef PADDLE_WITH_HIP - hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); - if (driver_result == hipSuccess) { -#elif defined(PADDLE_WITH_MUSA) - MUresult driver_result = muDriverGetVersion(&driver_version); - if (driver_result == MUSA_SUCCESS) { -#else - CUresult driver_result = 
dynload::cuDriverGetVersion(&driver_version); - if (driver_result == CUDA_SUCCESS) { -#endif - dirver_major = driver_version / 1000; - driver_minor = (driver_version % 1000) / 10; - } +//#ifdef PADDLE_WITH_HIP +// hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); +// if (driver_result == hipSuccess) { +//#elif defined(PADDLE_WITH_MUSA) +// MUresult driver_result = muDriverGetVersion(&driver_version); +// if (driver_result == MUSA_SUCCESS) { +//#else +// CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); +// if (driver_result == CUDA_SUCCESS) { +//#endif +// dirver_major = driver_version / 1000; +// driver_minor = (driver_version % 1000) / 10; +// } LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "." << driver_minor << "; NVRTC Version: " << nvrtc_major @@ -176,18 +176,18 @@ void GPUDeviceCode::CheckAvailableStatus() { } int count = 0; -#ifdef PADDLE_WITH_HIP - if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), - "hipGetDeviceCount")) { -#elif defined(PADDLE_WITH_MUSA) - if (CheckCUDADriverResult(muDeviceGetCount(&count), - "muDeviceGetCount")) { -#else - if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), - "cuDeviceGetCount")) { -#endif - available_ = true; - } +//#ifdef PADDLE_WITH_HIP +// if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), +// "hipGetDeviceCount")) { +//#elif defined(PADDLE_WITH_MUSA) +// if (CheckCUDADriverResult(muDeviceGetCount(&count), +// "muDeviceGetCount")) { +//#else +// if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), +// "cuDeviceGetCount")) { +//#endif +// available_ = true; +// } } static std::string FindCUDAIncludePath() { @@ -465,21 +465,21 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); #elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_EQ( - muLaunchKernel(function_, - num_blocks, - 1, - 1, // grid dim - num_threads_, - 1, - 1, // block dim - 0, // shared memory - dev_ctx->stream(), // stream - args->data(), // arguments - nullptr), - MUSA_SUCCESS, - errors::External("Fail to launch kernel %s (in muLaunchKernel.)", - name_.c_str())); + //PADDLE_ENFORCE_EQ( + // muLaunchKernel(function_, + // num_blocks, + // 1, + // 1, // grid dim + // num_threads_, + // 1, + // 1, // block dim + // 0, // shared memory + // dev_ctx->stream(), // stream + // args->data(), // arguments + // nullptr), + // MUSA_SUCCESS, + // errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + // name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 838b623ae7b38..883e95c41985f 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,6 +30,13 @@ if(WITH_ROCM) rocsparse.cc) endif() +if(WITH_MUSA) + list( + APPEND + MUSA_SRCS + mudnn.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. 
if(NOT APPLE) @@ -46,6 +53,9 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) + endif() endif() if(TENSORRT_FOUND) @@ -93,6 +103,8 @@ if(WITH_ROCM) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS}) elseif(WITH_GPU) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS}) +elseif(WITH_MUSA) + collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS}) else() collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS}) endif() diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index d6f42ff743e58..4020e811f7aca 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -24,6 +24,7 @@ namespace phi { using GPU_TYPE = ROCM_TYPE; #elif defined(PADDLE_WITH_MUSA) + //using mudnnHandle_t = ::musa::dnn::Handle; using mudnnHandle_t = bool**; using mublasLtHandle_t = bool**; using musparseHandle_t = bool**; diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index b7c9f9c4688dc..b891644679264 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -61,7 +61,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { @@ -231,21 +231,23 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr, // NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // the bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; + return 0; + //phi::dtype::bfloat16 low_half; + //// the bfloat16 in lower 16bits + //low_half.x = static_cast(val & 0xFFFFu); + //low_half = + // static_cast(static_cast(low_half) + x); + //return (val & 0xFFFF0000u) | low_half.x; } inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // the bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); + return 0; + //phi::dtype::bfloat16 high_half; + //// the bfloat16 in higher 16bits + //high_half.x = static_cast(val >> 16); + //high_half = + // static_cast(static_cast(high_half) + x); + //return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } #if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 @@ -264,54 +266,54 @@ CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) { PDBF16ToCUDABF16(val))); } #else -CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) { - // concrete packed bfloat16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the bfloat16 value stay at lower 16 bits of the address. 
- do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} +//CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) { +// // concrete packed bfloat16 value may exsits in lower or higher 16bits +// // of the 32bits address. +// uint32_t *address_as_ui = reinterpret_cast( +// reinterpret_cast(address) - +// (reinterpret_cast(address) & 0x02)); +// float val_f = static_cast(val); +// uint32_t old = *address_as_ui; +// uint32_t sum; +// uint32_t newval; +// uint32_t assumed; +// if (((uintptr_t)address & 0x02) == 0) { +// // the bfloat16 value stay at lower 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old & 0xFFFFu; +// return ret; +// } else { +// // the bfloat16 value stay at higher 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old >> 16; +// return ret; +// } +//} #endif -CUDA_ATOMIC_WRAPPER(Add, complex) { - float *real = reinterpret_cast(address); - float *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -CUDA_ATOMIC_WRAPPER(Add, complex) { - double *real = reinterpret_cast(address); - double *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} +//CUDA_ATOMIC_WRAPPER(Add, complex) { +// float *real = reinterpret_cast(address); +// float *imag = real + 1; +// return complex(CudaAtomicAdd(real, val.real), +// CudaAtomicAdd(imag, val.imag)); +//} +// +//CUDA_ATOMIC_WRAPPER(Add, complex) { +// double *real = reinterpret_cast(address); +// double *imag = real + 1; +// return complex(CudaAtomicAdd(real, val.real), +// CudaAtomicAdd(imag, val.imag)); +//} // For atomicMax USE_CUDA_ATOMIC(Max, int); @@ -449,55 +451,57 @@ CUDA_ATOMIC_WRAPPER(Max, phi::dtype::float16) { #endif inline static __device__ uint32_t bf16_max_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // The bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(max(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; + return 0; + //phi::dtype::bfloat16 low_half; + //// The bfloat16 in lower 16bits + //low_half.x = static_cast(val & 0xFFFFu); + //low_half = + // static_cast(max(static_cast(low_half), x)); + //return (val & 0xFFFF0000u) | low_half.x; } inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // The bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(max(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) { - if (*address >= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - 
uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} + return 0; + //phi::dtype::bfloat16 high_half; + //// The bfloat16 in higher 16bits + //high_half.x = static_cast(val >> 16); + //high_half = + // static_cast(max(static_cast(high_half), x)); + //return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +//CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) { +// if (*address >= val) { +// return *address; +// } +// uint32_t *address_as_ui = reinterpret_cast( +// reinterpret_cast(address) - +// (reinterpret_cast(address) & 0x02)); +// float val_f = static_cast(val); +// uint32_t old = *address_as_ui; +// uint32_t assumed; +// if (((uintptr_t)address & 0x02) == 0) { +// // The bfloat16 value stay at lower 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old & 0xFFFFu; +// return ret; +// } else { +// // The bfloat16 value stay at higher 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old >> 16; +// return ret; +// } +//} // For atomicMin USE_CUDA_ATOMIC(Min, int); @@ -635,55 +639,57 @@ CUDA_ATOMIC_WRAPPER(Min, phi::dtype::float16) { #endif inline static __device__ uint32_t bf16_min_to_low_half(uint32_t val, float x) { - phi::dtype::bfloat16 low_half; - // The bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = - static_cast(min(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; + return 0; + //phi::dtype::bfloat16 low_half; + //// The bfloat16 in lower 16bits + //low_half.x = static_cast(val & 0xFFFFu); + //low_half = + // static_cast(min(static_cast(low_half), x)); + //return (val & 0xFFFF0000u) | low_half.x; } inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) { - phi::dtype::bfloat16 high_half; - // The bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = - static_cast(min(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) { - if (*address <= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The bfloat16 value stay at higher 16 bits of the address. 
- do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f)); - } while (old != assumed); - phi::dtype::bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} + return 0; + //phi::dtype::bfloat16 high_half; + //// The bfloat16 in higher 16bits + //high_half.x = static_cast(val >> 16); + //high_half = + // static_cast(min(static_cast(high_half), x)); + //return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +//CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) { +// if (*address <= val) { +// return *address; +// } +// uint32_t *address_as_ui = reinterpret_cast( +// reinterpret_cast(address) - +// (reinterpret_cast(address) & 0x02)); +// float val_f = static_cast(val); +// uint32_t old = *address_as_ui; +// uint32_t assumed; +// if (((uintptr_t)address & 0x02) == 0) { +// // The bfloat16 value stay at lower 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old & 0xFFFFu; +// return ret; +// } else { +// // The bfloat16 value stay at higher 16 bits of the address. +// do { +// assumed = old; +// old = atomicCAS( +// address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f)); +// } while (old != assumed); +// phi::dtype::bfloat16 ret; +// ret.x = old >> 16; +// return ret; +// } +//} #ifdef PADDLE_WITH_CUDA /* diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h new file mode 100644 index 0000000000000..f6131fb1e53d6 --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_device_function.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. +#define PADDLE_MUSA_FP16 +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...)
\ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down_sync(mask, val, static_cast(delta), width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down_sync( + mask, val.to_half(), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16(__shfl_down_sync( + mask, val.to_mt_bfloat16(), static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = + static_cast(__shfl_down_sync(mask, + static_cast(val.real), + static_cast(delta), + width)); + double imag = + static_cast(__shfl_down_sync(mask, + static_cast(val.imag), + static_cast(delta), + width)); + return phi::dtype::complex(real, imag); +} +#if 0 +//template <> +//__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( +// unsigned mask, phi::dtype::float16 val, int width) { +// return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); +//} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16( + __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return 
__shfl_sync(mask, val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} +#endif + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} +} // namespace gpu +} // namespace backends +} // namespace phi + diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h index e69de29bb2d1d..57135ac49d905 100644 --- a/paddle/phi/backends/gpu/musa/musa_helper.h +++ b/paddle/phi/backends/gpu/musa/musa_helper.h @@ -0,0 +1,34 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
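// [Editor's note: illustrative sketch, not part of the patch.] reduceSum above
// expects every thread of the block to call it with tid == threadIdx.x, and it
// relies on blockDim.x <= 1024 so that the 32-entry shared buffer holds one
// partial sum per warp. A minimal caller, assuming n <= blockDim.x (SumKernel
// is a hypothetical name):
__global__ void SumKernel(const float *x, float *out, int n) {
  int tid = threadIdx.x;
  float v = (tid < n) ? x[tid] : 0.0f;           // inactive lanes contribute zero
  v = phi::backends::gpu::reduceSum(v, tid, n);  // warp shuffles + shared memory
  if (tid == 0) *out = v;                        // thread 0 holds the block-wide sum
}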
+ +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (0 >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi + diff --git a/paddle/phi/common/.float16.h.swp b/paddle/phi/common/.float16.h.swp new file mode 100644 index 0000000000000000000000000000000000000000..77f57398adbb1a6c5a54b0e5dc7285fccf261999 GIT binary patch literal 16384 zcmeI3U5q4E6~~K3Se9J`g+yNrmsuRU+1Z}%p4r~rc6PLxnPocc&aU&ZyF^UocGd0f z!s@D~K6-k91tSj{NCYL(1mlAtMl>-C5`7{NeDK9+6vg0!A2HFy=!+UqK*0aGb!(=& zr+Yru#Dt_K`LQ!q=bn4+x%ZxX&h4g_KUY1?j^zp(es0h-ckw0j?VeDfz;)V0v5=>R2S;vUH-2#CGIXR(QV*maNIj5xAoW1%fz$)32T~8D9(b>M zAZYK?9)j_E)Bx^R|6bAe@4M>fQT6wEu}df!#=9c6H=|4sG&arOSseed5;?_;|=S4M(yd-bQrRr`PRy;qOZe^L*m9!Ncq zdLZ>c>Veb)sRvRIq#j5;ka{5XzrJo&#S7kAsK7=fO0{fDeF|4rto5;G5tPum*f^H&_7+pbU_>ZW2|NM50`3D%a0=WBCc%fm4d8n4`)f7r$KVop8hjf(20B0o7r=RN5*!9M zf&*YbczvIy{Rlh{9s^$lUjXyq4saBF6x<4adyS@j6?_WZ3a$ii?$xxvfxm)FzyqHK zlVA_{!`1K?_&RtJJOVs01^#pu#t0U{Dewt!Gx#vL3HkmLtrJuAeNFW^_HvxML=s zo~_Kzo|@CU$4YpoX$(JIc7omJ3CjtD*Y-r98{7}3Li}AQ>A`K&nNd&p*^f(E_3aKl zV0eahod)D>?g+hJC}lcEC!k;S`b04kwA$G*mQOaED0U~&6mO`hZ5Hm;&9E({M?2hx zXzw7DvFEGH^ZG(%xq5C+KY6xL+OgeaLS%Q7+jZ5iuFF9b=}{)?mAsJI@Z5a$?2L5X z0}muTcY0;1vi>+BJ+m@f(JPBH^W^N&QKsvOCv5rzo5<(0S?yqAa@4q_oE#9fyBiLk zKeMzvJ9n-+Gsi5)#zZpFZQGV%1x$H9BmZS&H^WvxC>+zOqvb^~Y$^@c@dK8e(57_D z4tk54uFv`?Yf`t2jgQ9$VLmM%i;QvG;y$ap9z0NQ7fO*-^t%*c4iBu3U~(?7&tI$*0Ce3X2@HDB}Jk@duV_jugVofE- zMypG{9BAb*jU+r&<+j~nq&r>$G)6W1y)(1Eh&8N<8Cq#xXauh3Q*e1OUbR_lcuXe` z?SM9jS!W(!h6=MG{* z1E&$=0kZ{86^8xJS@CdfEfB2chK|X-wK2BZw2UT6nDCs-M7@rXC!B!PtoWj5WOK7L zHj;Au5R(uzpKJX$O@WwYQ_m=w=ZUO<~wH!5Qh4Oqb3C zVI$rQZ^8qtomvn6SZe3_COhn`&C^aUkPyt~uS` z)yCdcds^4k*|#yCbzSs*&mAcmbY>3FXN2ai^^w8R8H_j~txv@BqxmEFW}G?^53buC z`@jpGz-kHUhRtUS#vQj!o+&bJQ*1_1kDW@67@|c=Dib!TA$t>QX#7Le^h%CIU$|9Y zvh@VMg^SQ4I+lUKjKJ!3%WSJ<1wN*E{fPxpPw-aqnbG0_)X7Y8W1@UwlM}60 z3lg5?L_*1BtLsZkb46WG={2-}5hxwi#vcHVI< zlTlj15VGqCdjYn1uquSZ$dNMIGz_CqTwmk5uJ)kJbrE=3VqsbnQ39MiSoPETh0N2t%{B@F57VctqRu^W+n7|nTTaZ#Bc}HCLnAJjdDNJdSX<@0z zXk%_Rni9CA0L5{KjZ~Icb!mj%URkOxjmc>3*reJM_lA~-z+b~D(xwcS<5a+)R;gFs z@I-WQS`J(FtN^1MW4>DtR&lIC4QZmEW)QT?(X1TDDWvKm#YBYirJR0c?Z1k)~Ghi7b z%ZmA8k)4%av5)@CDq+w>K9yq?oNJ3D=(9!P3$H^nNA>^BxBz$&wIJ31@#o*qq1JyG z90PBm&VLqkz&&6Nyoi%cICGZpo!Fg~87zYQy3#j4052)^M0s%~L0p!4ba1GcC zeu&!r2fzdd$bkdkTCfkijr#oO;9KA!Ks9^;Tmyc$54M1N!7O+KwfM{6yWnA9fZM@G zz`Ll!Uj@&AM}Pq;;3jY*_#bDk6aN1-_!amip!s+Kq(57F;G#SxH}W$aMVdy% zEG{BK&tb^U%4M0fmCL;5XE?Ua=AtDj6{mqD9i{gJ-i|UZ>*tG9jYPHcfDTdV>2b#G zM%MQrt!g0Npj^i^EmCn2F?WjP4Np>d2y+wyUQf{P}PD5Yd z3;(A+5$$mg^uP3H>}rgbgIg{uSle!RY7#Q}+>u>)(4)dc7oO*O+jTH>g-q^hJfiJe zwtM+2q*kx*e0-S}?`AJ|VD(*f#Zcm+>b=~~cICtjwpcVQY!$UXqWxcUT~yzev*sW- z(Q5*CR08)Ab(FAz_SFDkkGJW16Xot^2_(WuGP*kQDAz%iTF66ka+9KdF$fh(gQYB| zSJ~kq+7fcmcDOJxP@Tt3i|%xb1K*P6rqw{5AniUfP$(fVxQ|GHhE&+3K)M_E9xc}l zZ5J|II2vN|8UX3wc5;?5jvP5YR7~Es$o;NI`_~WFkjy22KY^LXl}z*~aCJu@-Sp*! 
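// [Editor's note: illustrative sketch, not part of the patch.] The
// CUDA_KERNEL_LOOP_TYPE macro defined in musa_helper.h above expands to a
// standard grid-stride loop, so one launch with a fixed grid can cover any
// number of elements. Written out by hand it is equivalent to (ScaleKernel is
// a hypothetical name):
__global__ void ScaleKernel(float *data, float alpha, int64_t num) {
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i = idx; i < num; i += stride) {  // each thread strides over the whole grid
    data[i] *= alpha;
  }
}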
z)aE77?~@~6SmQA;L?+>CF%8myRBp# zDz_BGG>9%oO*L$k@tk5&iw0_l?c``Qa!!X21AVIdF6+L8sJdcu!j|wXLyt<@sR2j$ zG;VDIfdm~_JZuW}?2xR%^5t0}Wa^-9)zxs+i*2)n-d6DFc8SHH>;<>oXo&m;`G(f3 z(c#YJX{3fqqs(MBFJpXA5GI11@-BQII?){M%(a8pZLB<+e`GA6MTick63=OEL~`ZG4!Pz%ZtHTL za3`43Hijyr|u*PIc`y4hz2t^{!J>WA{)xE|W}nk=DFRLh$Hr(+nhZkCRW zJ|>_tTV-iEHZM?W<2qO{y7cfYTw!7iFp1&bB6(##bLdc^K(_?Ar$&rN5J#-QO)qJa zMp1PSbEvp)dv1p`aR=6fe&Dt!-caLtBq!(xR}bo@o_c}?vJ?MDeV)-p8#4|aibsLV zczJ8@GZ>KX%1bB9Csndjd`gEqgqn++cI9^|(Kny!vZY?fkdR406Z(&495uf3h5879 cK3QXka4^cX@i8uaREFQ>@L2;Nra9Vw0VR%^C;$Ke literal 0 HcmV?d00001 diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 4cc21a14faac8..73714adbf450c 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -26,6 +26,15 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif +#if defined(__MUSACC__) +#define PADDLE_MUSA_BF16 +#include +#endif + + #if defined(__CUDACC__) && CUDA_VERSION >= 11000 #define PADDLE_CUDA_BF16 #include @@ -63,7 +72,7 @@ struct PADDLE_ALIGN(2) bfloat16 { x = res >> 16; #elif defined(PADDLE_WITH_MUSA) #if defined(PADDLE_MUSA_BF16) - __nv_bfloat16 tmp = __float2bfloat16(val); + __mt_bfloat16 tmp = __float2bfloat16(val); x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); @@ -163,7 +172,7 @@ struct PADDLE_ALIGN(2) bfloat16 { return res; #elif defined(PADDLE_WITH_MUSA) #ifdef PADDLE_MUSA_BF16 - return __bfloat162float(*reinterpret_cast(&x)); + return __bfloat162float(*reinterpret_cast(&x)); #else float val = 0.f; uint16_t temp = x; @@ -190,6 +199,12 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#ifdef PADDLE_MUSA_BF16 + HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 43e513146ba0a..f4c5be53660aa 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -201,7 +201,7 @@ template HOSTDEVICE inline complex operator+(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) + thrust::complex(b)); #else return complex(a.real + b.real, a.imag + b.imag); @@ -212,7 +212,7 @@ template HOSTDEVICE inline complex operator-(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) - thrust::complex(b)); #else return complex(a.real - b.real, a.imag - b.imag); @@ -223,7 +223,7 @@ template HOSTDEVICE inline complex operator*(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) * thrust::complex(b)); #else return complex(a.real * b.real - a.imag * b.imag, @@ -235,7 +235,7 @@ template HOSTDEVICE inline complex operator/(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) / thrust::complex(b)); 
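// [Editor's note: illustrative sketch, not part of the patch.] On the fallback
// path in the bfloat16.h hunk above (no native __mt_bfloat16), the constructor
// keeps only the upper two bytes of the float via memcpy(... + 2, 2), which is
// a plain truncation and assumes a little-endian host. An endian-neutral way
// to express the same conversion:
#include <cstdint>
#include <cstring>
inline uint16_t FloatToBfloat16Bits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));      // reinterpret the IEEE-754 bit pattern
  return static_cast<uint16_t>(bits >> 16);  // keep sign, exponent and top mantissa bits
}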
#else T denominator = b.real * b.real + b.imag * b.imag; @@ -247,7 +247,7 @@ HOSTDEVICE inline complex operator/(const complex& a, template HOSTDEVICE inline complex operator-(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(-thrust::complex(a.real, a.imag)); #else complex res; @@ -261,7 +261,7 @@ template HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) += thrust::complex(b.real, b.imag)); return a; @@ -276,7 +276,7 @@ template HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) -= thrust::complex(b.real, b.imag)); return a; @@ -291,7 +291,7 @@ template HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) *= thrust::complex(b.real, b.imag)); return a; @@ -306,7 +306,7 @@ template HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) /= thrust::complex(b.real, b.imag)); return a; @@ -369,7 +369,7 @@ HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { template HOSTDEVICE inline bool(isnan)(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isnan(a.real) || ::isnan(a.imag); #else return std::isnan(a.real) || std::isnan(a.imag); @@ -379,7 +379,7 @@ HOSTDEVICE inline bool(isnan)(const complex& a) { template HOSTDEVICE inline bool isinf(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isinf(a.real) || ::isinf(a.imag); #else return std::isinf(a.real) || std::isinf(a.imag); @@ -389,7 +389,7 @@ HOSTDEVICE inline bool isinf(const complex& a) { template HOSTDEVICE inline bool isfinite(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isfinite(a.real) || ::isfinite(a.imag); #else return std::isfinite(a.real) || std::isfinite(a.imag); @@ -399,7 +399,7 @@ HOSTDEVICE inline bool isfinite(const complex& a) { template HOSTDEVICE inline T abs(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return thrust::abs(thrust::complex(a)); #else return std::abs(std::complex(a)); @@ -409,7 +409,7 @@ HOSTDEVICE 
inline T abs(const complex& a) { template HOSTDEVICE inline T arg(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return thrust::arg(thrust::complex(a)); #else return std::arg(std::complex(a)); @@ -419,7 +419,7 @@ HOSTDEVICE inline T arg(const complex& a) { template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); #else return complex(std::pow(std::complex(a), std::complex(b))); @@ -429,7 +429,7 @@ HOSTDEVICE inline complex pow(const complex& a, const complex& b) { template HOSTDEVICE inline complex sqrt(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::sqrt(thrust::complex(a))); #else return complex(std::sqrt(std::complex(a))); @@ -439,7 +439,7 @@ HOSTDEVICE inline complex sqrt(const complex& a) { template HOSTDEVICE inline complex tanh(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::tanh(thrust::complex(a))); #else return complex(std::tanh(std::complex(a))); @@ -449,7 +449,7 @@ HOSTDEVICE inline complex tanh(const complex& a) { template HOSTDEVICE inline complex log(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::log(thrust::complex(a))); #else return complex(std::log(std::complex(a))); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index 6783799026d44..cbbd632aa2484 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,7 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) +#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +209,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 572f460197f08..00de1bf605157 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -50,6 +50,11 @@ #include #endif +#if defined(__MUSACC__) +#define PADDLE_CUDA_FP16 +#include +#endif + #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include @@ -87,7 +92,7 @@ struct PADDLE_ALIGN(2) float16 { #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -106,7 +111,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -148,7 +153,7 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -222,7 +227,7 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -242,7 +247,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline operator float() const { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -399,7 +404,7 @@ DEVICE inline half operator-(const half& a) { #endif } -#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +#ifdef PADDLE_WITH_CUDA // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -1014,13 +1019,13 @@ struct is_pod { is_standard_layout::value; }; -template <> -struct is_floating_point - : 
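// [Editor's note: illustrative sketch, not part of the patch.] The float16.h
// hunks above only widen the preprocessor guards; the conversion itself remains
// a raw bit copy between the 16-bit storage and the toolkit half type through
// __half_raw. The same round trip in standalone CUDA code:
#include <cuda_fp16.h>
#include <cstdint>
inline __half BitsToHalf(uint16_t bits) {
  __half_raw raw;
  raw.x = bits;        // identical binary16 pattern, no numeric conversion
  return __half(raw);  // __half is constructible from __half_raw
}
inline uint16_t HalfToBits(__half h) {
  return reinterpret_cast<__half_raw *>(&h)->x;
}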
std::integral_constant< - bool, - std::is_same< - phi::dtype::float16, - typename std::remove_cv::type>::value> {}; +//template <> +//struct is_floating_point +// : std::integral_constant< +// bool, +// std::is_same< +// phi::dtype::float16, +// typename std::remove_cv::type>::value> {}; template <> struct is_signed { static const bool value = true; diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 4286dfcc1d0fa..c8ced345a637a 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -140,10 +140,10 @@ class ScalarBase { return static_cast(data_.f32); case DataType::FLOAT64: return static_cast(data_.f64); - case DataType::FLOAT16: - return static_cast(data_.f16); - case DataType::BFLOAT16: - return static_cast(data_.bf16); + //case DataType::FLOAT16: + // return static_cast(data_.f16); + //case DataType::BFLOAT16: + // return static_cast(data_.bf16); case DataType::INT32: return static_cast(data_.i32); case DataType::INT64: @@ -162,10 +162,10 @@ class ScalarBase { return static_cast(data_.ui8); case DataType::BOOL: return static_cast(data_.b); - case DataType::COMPLEX64: - return static_cast(data_.c64); - case DataType::COMPLEX128: - return static_cast(data_.c128); + //case DataType::COMPLEX64: + // return static_cast(data_.c64); + //case DataType::COMPLEX128: + // return static_cast(data_.c128); default: PD_THROW("Invalid enum scalar data type `", dtype_, "`."); } diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index e80561284b885..620d3d683fbf0 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,6 +153,12 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first), + CastToCUDATransformIterator(last), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -184,6 +190,13 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first1), + CastToCUDATransformIterator(last1), + CastToCUDATransformIterator(first2), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index aaa3eebfe27a5..f07a763ac52d0 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -414,6 +414,17 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) +#elif defined(__MUSACC__) +#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ + do { \ + if (!(_IS_NOT_ERROR)) { \ + printf("Error: %s:%d Assertion `%s` failed. 
" __FORMAT "\n", \ + __FILE__, \ + __LINE__, \ + #_IS_NOT_ERROR, \ + ##__VA_ARGS__); \ + } \ + } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index decebbe66a538..81e663fa20df6 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,6 +18,10 @@ #include #endif +#ifdef __MUSACC__ +#include +#endif + #if defined(__xpu__) #include @@ -26,7 +30,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 2e78357492734..f3dae52b04387 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -53,7 +53,7 @@ namespace phi { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index f96fdb1f28b63..d72046a82e0cb 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -281,17 +281,9 @@ namespace phi { PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::BFLOAT16, phi::bfloat16, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::FLOAT16, phi::float16, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \ PD_PRIVATE_CASE_TYPE( \ NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::COMPLEX64, phi::complex64, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::COMPLEX128, phi::complex128, __VA_ARGS__) \ default: \ PADDLE_THROW(phi::errors::InvalidArgument( \ "Invalid enum data type `%d`.", static_cast(__dtype__))); \ diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4b21e61f8d88c..e57ac9d80f6c1 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -26,19 +26,23 @@ file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") file(GLOB kernel_primitive_h "primitive/*.h") # fusion ops would be included here +#file( +# GLOB kernel_cu +# RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" +# "gpu/*.cu" +# "gpu/*.cu.cc" +# "gpudnn/*.cu" +# "kps/*.cu" +# "legacy/kps/*.cu" +# "legacy/gpu/*.cu" +# "selected_rows/gpu/*.cu" +# "sparse/gpu/*.cu" +# "strings/gpu/*.cu" +# "fusion/gpu/*.cu") file( GLOB kernel_cu RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "gpu/*.cu" - "gpu/*.cu.cc" - "gpudnn/*.cu" - "kps/*.cu" - "legacy/kps/*.cu" - "legacy/gpu/*.cu" - "selected_rows/gpu/*.cu" - "sparse/gpu/*.cu" - "strings/gpu/*.cu" - "fusion/gpu/*.cu") + "gpu/a*.cu") if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc index 9626621ae8657..0b324d584e4d4 100644 --- a/paddle/phi/kernels/activation_kernel.cc +++ b/paddle/phi/kernels/activation_kernel.cc @@ -32,7 +32,7 @@ using complex128 = ::phi::dtype::complex; 
PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(relu6, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index c44b6333154cc..425ce19808ea4 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -135,7 +135,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index dba08b0de366a..3f9050af76d8a 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 9f4b51281cd37..87dcd2eaa01ac 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 58cacd21bba18..8a694bec4a9b8 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(coalesce_tensor, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 7b9074ffa92f3..d47c98608c91f 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -257,7 +257,7 @@ PD_REGISTER_KERNEL( #define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \ PD_REGISTER_KERNEL( \ - name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {} + name, CPU, ALL_LAYOUT, phi::func, float, double) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 046cee5857808..62ae48766057c 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -191,8 +191,8 @@ PD_REGISTER_KERNEL(exp, float, double, int, - int64_t, - phi::dtype::float16) {} + int64_t) {} + //phi::dtype::float16) {} PD_REGISTER_KERNEL(expm1, CPU, @@ -201,8 +201,8 @@ PD_REGISTER_KERNEL(expm1, float, double, int, - int64_t, - phi::dtype::float16) {} + int64_t) {} + 
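// [Editor's note: illustrative sketch, not part of the patch.] The transform.h
// hunks earlier in this patch only swap the Thrust execution-policy namespace:
// on MUSA the backend is selected with thrust::musa::par.on(stream) instead of
// thrust::cuda::par.on(stream), while the call shape stays the same. The CUDA
// spelling of that pattern, with raw device pointers and a caller-owned stream
// (NegateOnStream is a hypothetical name):
#include <thrust/functional.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/transform.h>
void NegateOnStream(const float *in, float *out, int n, cudaStream_t stream) {
  thrust::transform(thrust::cuda::par.on(stream),  // enqueue asynchronously on `stream`
                    in, in + n, out, thrust::negate<float>());
}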
//phi::dtype::float16) {} PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {} PD_REGISTER_KERNEL( @@ -220,9 +220,9 @@ PD_REGISTER_KERNEL(log, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(log2, CPU, ALL_LAYOUT, @@ -230,9 +230,9 @@ PD_REGISTER_KERNEL(log2, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(log10, CPU, ALL_LAYOUT, @@ -240,9 +240,9 @@ PD_REGISTER_KERNEL(log10, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(log1p, CPU, ALL_LAYOUT, @@ -250,9 +250,9 @@ PD_REGISTER_KERNEL(log1p, float, double, int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + int64_t) {} + //phi::dtype::float16, + //phi::dtype::bfloat16) {} PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel) PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel) diff --git a/paddle/phi/kernels/cpu/cast_grad_kernel.cc b/paddle/phi/kernels/cpu/cast_grad_kernel.cc index 403caf997dbf7..fad74ef9e7ce9 100644 --- a/paddle/phi/kernels/cpu/cast_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_grad_kernel.cc @@ -25,9 +25,9 @@ void CastGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, DenseTensor* x_grad) { - PD_VISIT_ALL_TYPES(x.dtype(), "CastKernelImpl", ([&] { - CastKernelImpl(dev_ctx, out_grad, x_grad); - })); + //PD_VISIT_ALL_TYPES(x.dtype(), "CastKernelImpl", ([&] { + // CastKernelImpl(dev_ctx, out_grad, x_grad); + // })); } } // namespace phi diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 442290c3648e2..658135c36fd72 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL( dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 54449200ae4b2..76377d201e274 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 3ecef871d211d..595f38e03910f 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc 
b/paddle/phi/kernels/flatten_kernel.cc index 6b22ac7518179..b7b41782ba092 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 982b6a396c2a8..e709be621c8d6 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/.im2col.cu.swp b/paddle/phi/kernels/funcs/.im2col.cu.swp new file mode 100644 index 0000000000000000000000000000000000000000..530e1d87fe3d7b003699505266a4bf81f84ee1b8 GIT binary patch literal 16384 zcmeHNZHya786HY$DYQwHwn+S-nqEa@=N$X)66M2p=WyI5sYQBMQqiKo?5~xCai2kUMP=J;eeu0n(0`VyXLPG5iB!tQzghWAzKU72r)RvFuo!Rl) zzVq2pf(T+)dc2;Q_iNsHXLjb@T^u=6euN*+k1|}l7%Tkc`}UO&9_*hGpB1sJPA0ivJfHby-*ufhU}>u+;RbVL`fMRo}^1 zo7>4t+fomt9=IM4+{5;djgAnc z>Vf}D4_G#1yV3I(>7b1F|K|Mv!<~%%0C*CZ1nvj!1Af1Qu~&gF0vCb%fnVIi*w=xt z0iOdt4txwa4Lkt+>2Aip3w#RrBybs60hWQYKpwaY_}x396L=PA0LOv%0N38Z*w2A4 z08L;P$N;wkFW<%3Gr$}$4qSUXW4{Kz0z3)457+_x>Q2T!0jvVUz#r~l>;=FBjsS;& z*WSk1kAY_Z2^?-hO;2baqj01muD`T$!SAnO2r+`bq8qfqH;4xqlH~_qg zoWyT|Zv!E)3>1O8fmh&v8iNg<=6eez1p2m#9~LX>-3J0t@;x*^JgOBUWWqO#5CW)}uY7 zmi~ag(Oh(7waBWD6^6X*j!pZ{aeVDT#w!)-(~n}2s{O#VqGEAn`uJRqcSy|fg%ZES zBH=b*x8M_6To;Y5TqrTVw*bH!=cU(ZMwM#a^5{F09AA=7B!Ws^$XY#W71yL4)pOi$ zMnr8IS7gYKE2Y_IduB>G^qJn;lql@ok~<9{_F zP?}qHXisGAZJ7*@#Fy@!E@+$JCasWsraQuim4+<2v14$kL61{KdP!d{a}1oo;2SNR zM4`VVJ7G|xVtO>JI;2!|YE8t-C7w}TaI6gsG=FfC&tMQvsuE&w_TjVB**2}1XtCZl z8K_g`FuI4zai+$A%v0T^UTXn* z{fVjfar)RxdbRezB_`E#azF=@HkBt_vF$BgW{imov!fLvFtKAcC*>X!CJCVKe?^3% zE!O#ayFxN+#s%}HU$F3_v5-QW=k$NwqfH$`#3Xa4!=xew0U1>2G6)r|VadYWWUXj3 zwL{d=L{tyXmR)*a5FP!xsyI5dP@$*M#!xjZz3ntZ&@xl0(b1;Ieh|ZGUTaAc+-tfN zYoVC}7{x`$udZN$%daC`YIthSNH@QMncHgEpx9R>YFbBAR1bt@D<+Bu{lFu;v>qBm zD#ei*s&yReYvX`870BC)>{yt#rA(d{ zgOSy@B{L7%enbQG)kI#7RVp>dU$mS`MQ{G}XS(sm&%_ID4!gH!*oiUEaz)s%s)DPH z9-gR44fcO1vxlTtb(*%|L;5S>-d_gShqflJ`mXDH1B<5<|m)-l<=C0QF>RlGAMhNLeOdN*HctFfG#AIPZg!iNr7 zr?%@MZ{oQSU`Z1EBsSdGo#$z@J1<0g9jC1U(S!|Al}ob99m}gVt(xF9e-+W*5rG=K0(yK0P-(Q(h>~&7yRiPtBg=A1Kex z;1ll4WOD$i6uh6L+5B;TR%?gCxtJI^Y zQ7jbJ*4FYCg0L>~eo!kw5P2>voGecto1H(lZ!C|`W6$|b?phn%UyLkBw6M@OP+ID6 ztQ9eA7Zsi7N4$xhX;ep-FyRH9Q56k(EP$1qVR;DqH8`qWM4rV1*Y2DtWt&>04Mxhx+W85`q|s!#0oFP~~gbw3F6e9Ce7DIyH{ zDG`cbl}3op|2uIuJ&N-zo&S%;S&)};*8c|Z9Pnvi8u&ZT`hNp{1N;>DEN}_1fxW=J zz?(Sl{|NXh@EPEvzzQ%AJOZ2q?gM^;^Zpg!46qA$3FrK$fhpj1oa?^@1i(Jvk2u#q z4^)760lR^nzz*P-INQGnJO!Kp-VI#C`Tjcqo%cToJOI3g^ZoaLj{plm0r)G<_CEu@ z1k`{dz#YI#@WmBi8IZs{K>jHK>6LmQ^+3x5mpLyKc*k>`b=Rq1mT;YHd1n_Gj96#>#HL4Ab27@t#|xHa)Rzx0q?R zkDQvgRUD<9bE_EhpNOcza+d#}tRdf^5vF+xUeM2J3|-j2p6J zpre0(#}1Yl=r5=5L4P0-HykH0!=$}*pdk9jq>IH6b75j)f--D5oPH>QhPN*xIm({V zo1r|$L|ASnTPl^1NpSM5`J?67(%rA}bJVC4$;4|yJDy~UDweKOdlghw)^j?6rwSW5 z6VeMaS9)zEkLM_xZ)`V{mJ(Yk^`|%=NB(VeWb6QPZ%HNPWRucHoD(;TM_UV+_m7#) zOj1v;ElhpatBr(cgESx9n%UIH%#`o!2r527ynN3dlO5MNd{vv|{W^6W-da^$y`84R 
zTPZr+OV5F=^~BX%cICiU3Jw?rodZU08__B{8_4NsVErcrC{z}TUSFv5b(SNUfFnKP zHbmM&cu{`HgbRsNI}Q;9IBXY-WpCATq&*eXnl30aL#^PY42yX$l8G0_iX)cr4swW* u9J*T6or~^hXOSpdt0Es}R908U=T`O(4v literal 0 HcmV?d00001 diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index f90147b013023..662b9275aa7aa 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -1,4 +1,4 @@ -#add_subdirectory(eigen) +add_subdirectory(eigen) add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 203f6837d4611..ef13b248f4c90 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -2566,7 +2566,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index 5f66f6f1abd4d..4c4bf031b4338 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,7 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +63,7 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e754ce3bf49e4..5f19522d28f18 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,7 @@ limitations under the License. 
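// [Editor's note: illustrative sketch, not part of the patch.] LowerBound and
// UpperBound in the algorithm.h hunk above are ordinary binary searches that
// now also compile under the MUSA device pass. A typical device-side use is
// mapping a flat element index to its row through a CSR-style offsets array
// (FindRow is a hypothetical name; the phi::funcs namespace is assumed):
__device__ int64_t FindRow(const int64_t *row_offsets, size_t num_rows,
                           int64_t element_index) {
  // index of the first offset strictly greater than element_index, minus one
  size_t pos = phi::funcs::UpperBound(row_offsets, num_rows + 1, element_index);
  return static_cast<int64_t>(pos) - 1;
}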
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +27,7 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5a7574b56a891..3086d5dc4ed14 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -21,6 +21,10 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/funcs/segmented_array.h" +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/gpu/musa/musa_helper.h" +#endif + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu index add838106bfe8..00e885eeac5a1 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cu +++ b/paddle/phi/kernels/funcs/cross_entropy.cu @@ -154,9 +154,11 @@ void CrossEntropyFunctor::operator()( template class CrossEntropyFunctor; template class CrossEntropyFunctor; template class CrossEntropyFunctor; -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(8, 1, 0) +#if defined(PADDLE_WITH_CUDA) +#if CUDNN_VERSION_MIN(8, 1, 0) template class CrossEntropyFunctor; #endif +#endif } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a30fb79f8c8b0..f0235f0baec5f 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) thrust::device_vector diag_vec(vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index abade7ac0ef87..2ae5c912db937 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include #endif @@ -28,7 +31,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -49,7 +52,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) - std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -113,7 +116,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) namespace kps = phi::kps; @@ -186,6 +189,69 @@ struct normal_distribution { static constexpr int kReturnsCount = 2; }; +#elif defined(__MUSACC__) +template +struct uniform_distribution { + __device__ inline T operator()(murandStatePhilox4_32_10_t *state) const { + return static_cast(murand_uniform(state)); + } + static constexpr int kReturnsCount = 1; +}; + +template <> +struct uniform_distribution { + __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + murandStatePhilox4_32_10_t *state) const { + return murand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + murandStatePhilox4_32_10_t *state) const { + ulonglong2 result; + uint4 rand = murand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { + return murand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + murandStatePhilox4_32_10_t *state) const { + return murand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; #else template struct uniform_distribution { diff --git a/paddle/phi/kernels/funcs/eigen/.extensions.h.swp b/paddle/phi/kernels/funcs/eigen/.extensions.h.swp new file mode 100644 index 0000000000000000000000000000000000000000..e41cbfca9e327039544418de0e5a9e6d0fc9fc8d GIT binary patch literal 16384 zcmeI3Uu+yl9ml7o(AH2|1&J4m_{Emc862O_N$liojEv*7N0M`~oir__$K3Ah-N4@N zdUrNw69rHJsS=_Pq#_}~Ye+%x(n?4a)TT%f@xTMLf`|429{SK1B2;*2Lg71myY~5f z{*&d80L@9CoNx9wzu(Mve!DxfJE@PIshpxCqa}vVeT;qW_*MJ<$3|{{t;$&F`$`w@ z_9-2moH#n^OqT2kzLRaS>9<5N;I{3GS}Trt`gbvC%Hn55=m|F}ItTj}v9)AH**$N!cdpGwO=pE}O` zr^V*WC}0#Y3K#{90!9I&fKk9GU=%P47zK<1MuC5W0^DZoUiiJC7vgyTpRNC&N8wMw z8aNCFz@6abcQf`I@Ktaeym=2}&wxwd8{kXeA_&0>sDMFm<6VsX7Ca5EfNz1PKmZ;E z@a2AY!KfaBzSHUab8n_5NFc01f_JOzXu;$0$ zS?~<_9`Jz!J^{*L2;2s4VqEpuzS-wN3(l~{(N;z=dBD?i5-_FT|M$l-1z z5{pVS=eGh^tY)PnX>r9I#Ot*MbdMnC-9}nmJN6;jTMIJJcUBVq1P2? 
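// [Editor's note: illustrative sketch, not part of the patch.] The murand
// specializations above follow the same shape as the curand-based ones they sit
// next to: each thread owns a Philox4x32-10 state and every call produces 1, 2
// or 4 variates (kReturnsCount). The CUDA spelling of that per-thread pattern,
// which the murand_* calls appear to mirror (FillUniform is a hypothetical name):
#include <curand_kernel.h>
__global__ void FillUniform(float *out, unsigned long long seed,
                            unsigned long long offset, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);  // one independent subsequence per thread
  float4 r = curand_uniform4(&state);      // four uniforms in (0, 1]
  int base = idx * 4;
  if (base + 3 < n) {                      // tail handling omitted for brevity
    out[base] = r.x;
    out[base + 1] = r.y;
    out[base + 2] = r.z;
    out[base + 3] = r.w;
  }
}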
zQE+#rT;4F6q6=u(eC2`pYVFwknabRJ?fB{H9LjV#ZSNs&wbp7uC@iVBrWA&?hMQZX zW0Bo!hwk@}*Qc@wBI#=9J9K5;X1kssTspv~Vt3Au6vpEaeQ2vF+2IeJndxK$qvYnRNP{0`*GbB7>&##v+tjV8yV46v5#q%lN+g8>PkH#wH63lILWui zTcS&db;l>A5+Oz}{nlffRqNQ`WZ~}A)tvZfXN-nVvE-Wo~uG2kN+io{mMS_Gv)vCOzFQGVzZ$0p&b7U+>VCgv$_tCZ7 zaUEyU$r0D}+aglAqhkAJo~TXPXzn3wHY|mbIB+;moc=e{TOZVehsN>@>iWe9xulax z-xmIp(9m1>6!euQvi~)rX{Kxv?#1F!rK22>eh9yOm2 zTI_PK5%Y$ihJOwltM!NlB5cVh*-cH{v$dOtE6WcqoL;7fXBQV|tIL)7C0badxrOSn z%5r6)ieJZRw)zO2s8o-QkdSCbNMaRp-zXx4l$exDVJA(pB{E3XV~onyMw5&%J>M|E z>VkVTIJ-oZr9t}G>{4ZEL^tTP=W5qsXFXz5eT{g47#7ET5uIx5sJsyh1W`U_{nQSz zX7L&$6#0&7^H9Kz_M|^erGj#?*lxE+Ig+Uee;MoHA7QPm*Zh2&??ijnn};`IamH!0p7u=OYV zoTjNx9|dLNQ>^RxQu7%Q6vwB06jCT4pb|(N9xjTK%6J>Mq>Yz!qH zHcmhV2?=rF$PI+V4Q_BmNWcM*%8@Gw!2u*d;=;!T3B-Ta>oks2+l_<}Xjb}Vv%7EJ zzWKk|nHObh2hSEy)6vWf!*Lg5)-NAAzdxD2;g6pf3#3%~_NZfK?#S%ik;ai3XVxBO zYqeyXTfXf$F2C50LOc20@>{~X!~>7Jq1A}IdT4Rc`->*pf=v0dXVn9du7aa#>^;S6fg=H1&jhl0i%FXz$jo8_&+G1{9WutbmWGlGmj_d zU0cq{r8yY|i~>dhqkvJsC}0#Y3K#{90!9I&fKk9G@E=sbb{Kp4M#etB3Blw4|GmHe zKfaT(kH8DyIM@g71>fJn*mvM9@D$hwzSzsyyWkb@G?)W_-Okvb;Aij^_!4{oUIi8K z5ZDFYzm2gsK^YtXyTK=W7<(JkKn~1-3Gm|tW1oUIfDiH@2j;*8_~ur|-UBazG9d5( zxF6gHzP*L9cYqI$0tT+^X6!W(fD*`nJ>Z9%8T$;p4m>aqCctmd=~wVI(3)KV<}eBv z1^#6PE)!W6d3KwJzFp^pJwMiRC9TL`q=nQ8m_d}SS@O?>CEgn-p4YHgB|KEb~DsMcxv|W zMK#K1=<^6pj*jqAlhTC~BE=!KtvH1$i0Xrg9?2i03p;3cc!%u{?XX>ThwWzKcKb2Xg~$5``&1%mnzVk?G;myC?6OY8Wk4h& zm2qOhr*N94<0QOHyf<%3HX0#UVVd}w;+_iQ^yxNzM_qcMqCybZ>xZ%@gqqwir{`Ip zW%pvxzW1JMTNk}5KGWEJ5uf+kB!WzDXTBV%eoNgyZ5b+Ci8`%H;n32m?M`*W*gEJY z2_mFXUHk*;T!%1|G_EG2l-P0qX}cqltKf>J$DO<$Mfyl?SGgC;;I!@czUA(X`D&WV zws$Grvn$KaY5N%&q>)9H*V&0e;WcS6dB^cgAAK(v;xDmVL?ApKDETh4Ou=1yd;>w$~fj}p?$v{KtQMUBvwo|UZ3qs)?uVw730CeU`SXbU|} zVL~Nc#RQ~UoK9ga#Bk2g0gF{Q-+Yf6vTGCiye=A;-(1^kMs|}^Q?6pqwf?Gn9<)Um z#$IjDVdsj~$Cg*Bbgob?>k4V9Ld#`ZSS~FVb*+S}lT;|3rzeV~MNItyF$9>>{eXud z;e7(jx8P2OIRZ%)wMcPiFkBKLZ8ne|*&d||6)ILzbfQoxR?=+6bG6feKO2bvUTW)P z`|w`3QG}r{9sNGL8E`yP66vQ-AQU>4rcgFi#}2rjs)<@k`8mt#bUGOu1!9Y5WYDy9 zg$LzA>r`=JsZ?2dIGe%t*z+=?w!KcWrfhh$sV;q7HTs0L6 { static void Eval(const Eigen::DefaultDevice& dev, OutType out, const InType& in) { - out.device(dev) = in.erf(); + //out.device(dev) = in.erf(); } }; @@ -42,8 +42,8 @@ struct EigenErfGrad { OutType din, const InType& in, const InType& dout) { - din.device(dev) = - dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); + //din.device(dev) = + // dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); } }; diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h index c724564417b19..4189faea8faa9 100644 --- a/paddle/phi/kernels/funcs/eigen/extensions.h +++ b/paddle/phi/kernels/funcs/eigen/extensions.h @@ -131,7 +131,7 @@ struct NumTraits : GenericNumTraits { return phi::dtype::raw_uint16_to_float16(0x7c01); } }; - +#if 0 namespace numext { //////////// bfloat methods ///////////// @@ -435,6 +435,7 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) { } } // namespace numext +#endif } // namespace Eigen #endif // __xpu__ diff --git a/paddle/phi/kernels/funcs/eigen/pad.cu b/paddle/phi/kernels/funcs/eigen/pad.cu index c4a3dd9ecc4f5..42ac4e51de261 100644 --- a/paddle/phi/kernels/funcs/eigen/pad.cu +++ b/paddle/phi/kernels/funcs/eigen/pad.cu @@ -39,7 +39,7 @@ struct EigenPad { const InType& in, const Array& padding, const T value) { - out.device(dev) = in.pad(padding, value); + //out.device(dev) = in.pad(padding, value); } static void Eval32(const Eigen::GpuDevice& dev, @@ -47,7 +47,7 @@ struct EigenPad { const InType32BitIndex& in, const Array32Bit& 
padding, const T value) { - out.device(dev) = in.pad(padding, value); + //out.device(dev) = in.pad(padding, value); } }; diff --git a/paddle/phi/kernels/funcs/eigen/slice.cu b/paddle/phi/kernels/funcs/eigen/slice.cu index ade58d0698759..64d7e243bc38d 100644 --- a/paddle/phi/kernels/funcs/eigen/slice.cu +++ b/paddle/phi/kernels/funcs/eigen/slice.cu @@ -39,7 +39,7 @@ struct EigenSlice { const InType& in, const Array& offsets, const Array& extents) { - out.device(dev) = in.slice(offsets, extents); + //out.device(dev) = in.slice(offsets, extents); } static void Eval(const Eigen::GpuDevice& dev, @@ -47,7 +47,7 @@ struct EigenSlice { const InType32BitIndex& in, const Array32Bit& offsets, const Array32Bit& extents) { - out.device(dev) = in.slice(offsets, extents); + //out.device(dev) = in.slice(offsets, extents); } }; diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 274ac1cc32c05..683696f810c80 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -151,7 +151,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -486,7 +486,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) // static unroller template
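// [Editor's note: illustrative sketch, not part of the patch.] Most hunks in
// this part of the patch repeat a single pattern: every guard that used to test
// only the NVIDIA and AMD device compilers now also tests __MUSACC__, so the
// same headers compile under the MUSA toolchain. Reduced to its essence
// (HOSTDEVICE and Lerp here are standalone stand-ins, not the project's
// definitions):
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif

HOSTDEVICE inline float Lerp(float a, float b, float t) {
  // builds for the host everywhere, and for the device under any of the three compilers
  return a + t * (b - a);
}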