diff --git a/paddle/phi/backends/gpu/musa/miopen_desc.h b/paddle/phi/backends/gpu/musa/miopen_desc.h deleted file mode 100644 index ae0e274ca650e..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_desc.h +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#include "paddle/phi/core/utils/data_type.h" - -namespace phi { -namespace backends { -namespace gpu { - -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); - int H, W, D, C; - if (dims.size() == 4) { - H = dims[1]; - W = dims[2]; - C = dims[3]; - transformed_dims[1] = C; - transformed_dims[2] = H; - transformed_dims[3] = W; - } else { - D = dims[1]; - H = dims[2]; - W = dims[3]; - C = dims[4]; - transformed_dims[1] = C; - transformed_dims[2] = D; - transformed_dims[3] = H; - transformed_dims[4] = W; - } - return transformed_dims; -} - -inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { - miopenDataType_t type = miopenFloat; - switch (t) { - case phi::DataType::FLOAT16: - type = miopenHalf; - break; - case phi::DataType::FLOAT32: - type = miopenFloat; - break; - default: - break; - } - return type; -} - -class ActivationDescriptor { - public: - using T = miopenActivationDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(t)); - t = nullptr; - } - } - }; - ActivationDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - template - void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_.get(), mode, static_cast(coef), 0.0, 0.0)); - } - - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - private: - std::unique_ptr desc_; -}; - -class TensorDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - TensorDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / 
groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - void set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { - const int groups = 1; - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class FilterDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - FilterDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, - const miopenTensorFormat_t format, - const int groups = 1) { - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class ConvolutionDescriptor { - public: - using T = miopenConvolutionDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(t)); - t = nullptr; - } - } - }; - ConvolutionDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(miopenDataType_t dtype, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations, - bool allow_tf32, - const int groups = 1) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - (miopenConvolutionDescriptor_t)desc_.get(), - static_cast(pads.size()), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( - 
(miopenConvolutionDescriptor_t)desc_.get(), groups)); - } - - private: - std::unique_ptr desc_; -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/miopen_helper.h b/paddle/phi/backends/gpu/musa/miopen_helper.h deleted file mode 100644 index 095f32ba460d0..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_helper.h +++ /dev/null @@ -1,595 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "gflags/gflags.h" - -#include "paddle/phi/backends/dynload/miopen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" - -// MIOPEN do not have epslion definition -#define CUDNN_BN_MIN_EPSILON 1e-05 - -DECLARE_bool(cudnn_deterministic); - -namespace phi { -namespace backends { -namespace gpu { - -inline const char* miopenGetErrorString(miopenStatus_t status) { - switch (status) { - case miopenStatusSuccess: - return "miopenStatusSuccess"; - case miopenStatusNotInitialized: - return "miopenStatusNotInitialized"; - case miopenStatusAllocFailed: - return "miopenStatusAllocFailed"; - case miopenStatusBadParm: - return "miopenStatusBadParm"; - case miopenStatusInternalError: - return "miopenStatusInternalError"; - case miopenStatusInvalidValue: - return "miopenStatusInvalidValue"; - case miopenStatusUnknownError: - return "miopenStatusUnknownError"; - case miopenStatusNotImplemented: - return "miopenStatusNotImplemented"; - default: - return "Unknown miopen error number"; - } -} - -// no use, but will have compiling error if not defined -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) - -enum class DataLayout { // Not use - kNHWC, - kNCHW, - kNCDHW, - kNDHWC, // add, liyamei - kNCHW_VECT_C, -}; - -enum class PoolingMode { - kMaximum, - kMaximumDeterministic, - kAverageExclusive, - kAverageInclusive, -}; - -enum class ActivationMode { - kNone, // activation identity - kSigmoid, - kRelu, - kRelu6, - kReluX, - kTanh, - kBandPass, -}; - -inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return miopenPoolingMax; - case PoolingMode::kAverageExclusive: - return miopenPoolingAverage; - case PoolingMode::kAverageInclusive: - return miopenPoolingAverageInclusive; - case PoolingMode::kMaximum: - return miopenPoolingMax; - default: - PADDLE_THROW( - phi::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); - } -} - -inline ActivationMode StringToActivationMode(const std::string& str) { - if (str == "identity") { - return ActivationMode::kNone; - } else if (str == "sigmoid") { - return ActivationMode::kSigmoid; - } else if (str == "relu") { - return ActivationMode::kRelu; - } else if (str == 
"relu6") { - return ActivationMode::kRelu6; - } else if (str == "relux") { - return ActivationMode::kReluX; - } else if (str == "tanh") { - return ActivationMode::kTanh; - } else if (str == "bandpass") { - return ActivationMode::kBandPass; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unknown MIOPEN activation string: %s.", str)); - } -} - -template -class CudnnDataType; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenHalf; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenBFloat16; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenFloat; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { - switch (order) { - case DataLayout::kNHWC: - return MIOPEN_TENSOR_NHWC; - case DataLayout::kNCHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNCDHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNDHWC: - return MIOPEN_TENSOR_NHWC; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "MIOPEN has no equivalent dataLayout for input order.")); - } - return MIOPEN_TENSOR_NCHW; -} - -class ScopedTensorDescriptor { - public: - ScopedTensorDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& dims, - const int groups = 1) { - // the format is not used now, will add later - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - // Update tensor descriptor dims setting if groups > 1 - // NOTE: Here, Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - - // MIOPEN ONLY support data layout of NCHW - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - if (dims.size() == 4) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } else if (dims.size() == 5) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - 
const_cast(strides.data()))); - } - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); - } - - inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, - const std::vector& dim, - const std::vector& stride) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - miopen_type, - dim.size(), - const_cast(dim.data()), - const_cast(stride.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const std::vector& dim, - const std::vector& stride) { - return descriptor(CudnnDataType::type, dim, stride); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); -}; - -class ScopedDropoutDescriptor { - public: - ScopedDropoutDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateDropoutDescriptor(&desc_)); - } - ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyDropoutDescriptor(desc_)); - } - - inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, - const phi::Place& place, - bool initialized, - float dropout_prob_, - phi::DenseTensor* dropout_state_, - int seed, - size_t state_size) { - if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - 0 /* dropout */, - nullptr, - 0 /* state_size */, - 0 /* seed */, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - return desc_; - } - auto* dropout_state_data = dropout_state_->data(); - if (!initialized) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - seed, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } else { - auto dropout_state_dims = dropout_state_->dims(); - state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( - desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - 0, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } - return desc_; - } - inline miopenDropoutDescriptor_t desc() { return desc_; } - - private: - miopenDropoutDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); -}; - -class ScopedRNNDescriptor { - public: - ScopedRNNDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); - } - ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); - } - - inline miopenRNNDescriptor_t desc() { return desc_; } - - private: - miopenRNNDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); -}; - -class ScopedFilterDescriptor { - public: - ScopedFilterDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& kernel, - const int groups = 1) { - // filter layout: MCHW(MCDHW), where M is the number of - // output image channels, C is the number of input image channels, - // D is the 
depth of the filter, H is the height of the filter, and W is the - // width of the filter. - std::vector kernel_with_group(kernel.begin(), kernel.end()); - if (groups > 1) { - kernel_with_group[0] /= groups; - // NOTE: input filter(C) of the filter is already asserted to be C/groups. - } - std::vector stride_dim(kernel_with_group.size()); - stride_dim.push_back(1); - for (int k = kernel_with_group.size() - 2; k >= 0; k--) { - stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - kernel_with_group.size(), - const_cast(kernel_with_group.data()), - const_cast(stride_dim.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& kernel, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, kernel, groups); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); -}; - -class ScopedConvolutionDescriptor { - public: - ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); - } - ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); - } - - inline miopenConvolutionDescriptor_t descriptor( - miopenDataType_t type, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - PADDLE_ENFORCE_EQ(pads.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of pads and strides should be equal. But " - "received size of pads is %d, size of strides is %d.", - pads.size(), - strides.size())); - PADDLE_ENFORCE_EQ( - pads.size(), - dilations.size(), - phi::errors::InvalidArgument( - "The size of pads and dilations should be equal. But received size " - "of pads is %d, size of dilations is %d.", - pads.size(), - dilations.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - desc_, - pads.size(), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - return desc_; - } - - template - inline miopenConvolutionDescriptor_t descriptor( - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - return descriptor(CudnnDataType::type, pads, strides, dilations); - } - - private: - miopenConvolutionDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); -}; - -class ScopedPoolingDescriptor { - public: - ScopedPoolingDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreatePoolingDescriptor(&desc_)); - } - ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyPoolingDescriptor(desc_)); - } - - inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, - const std::vector& kernel, - const std::vector& pads, - const std::vector& strides) { - PADDLE_ENFORCE_EQ(kernel.size(), - pads.size(), - phi::errors::InvalidArgument( - "The size of kernel and pads should be equal. But " - "received size of kernel is %d, size of pads is %d.", - kernel.size(), - pads.size())); - PADDLE_ENFORCE_EQ( - kernel.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of kernel and strides should be equal. 
But " - "received size of kernel is %d, size of strides is %d.", - kernel.size(), - strides.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( - desc_, - GetPoolingMode(mode), - kernel.size(), - const_cast(kernel.data()), - const_cast(pads.data()), - const_cast(strides.data()))); - return desc_; - } - - private: - miopenPoolingDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); -}; - -class ScopedActivationDescriptor { - public: - ScopedActivationDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&desc_)); - } - ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(desc_)); - } - - template - inline miopenActivationDescriptor_t descriptor( - const std::string& act, double value_max = static_cast(0.)) { - double relu_ceiling = 0.0; - ActivationMode activation_mode = StringToActivationMode(act); - miopenActivationMode_t mode; - switch (activation_mode) { - case ActivationMode::kNone: - mode = miopenActivationPASTHRU; - break; - case ActivationMode::kRelu6: - relu_ceiling = 6.0; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kReluX: - relu_ceiling = value_max; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kRelu: - mode = miopenActivationRELU; - break; - case ActivationMode::kSigmoid: - mode = miopenActivationLOGISTIC; - break; - case ActivationMode::kTanh: - mode = miopenActivationTANH; - break; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "Unrecognized MIOPEN activation mode: %d.", - static_cast(activation_mode))); - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_, mode, relu_ceiling, 0.0, 0.0)); - return desc_; - } - - private: - miopenActivationDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); -}; - -class ScopedCTCLossDescriptor { - public: - ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); - } - ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); - } - - template - inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( - desc_, CudnnDataType::type, 0, false)); - return desc_; - } - - private: - miopenCTCLossDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc index 6579ce63f21f6..f2087e4d7f4fc 100644 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -88,16 +88,15 @@ int GetGPUComputeCapability(int id) { "but received id is: %d. 
GPU count is: %d.", id, GetGPUDeviceCount())); - return 100; - //int major, minor; - //auto major_error_code = musaDeviceGetAttribute( - // &major, musaDeviceAttributeComputeCapabilityMajor, id); - //auto minor_error_code = musaDeviceGetAttribute( - // &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - //PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - //PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - //return major * 100 + minor; + int major, minor; + auto major_error_code = musaDeviceGetAttribute( + &major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = musaDeviceGetAttribute( + &minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; } int GetGPURuntimeVersion(int id) { @@ -138,7 +137,8 @@ int GetGPUMultiProcessors(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); return count; } @@ -152,7 +152,7 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); return count; } @@ -167,7 +167,7 @@ int GetGPUMaxThreadsPerBlock(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); return count; } @@ -188,17 +188,17 @@ std::array GetGpuMaxGridDimSize(int id) { std::array ret; int size; auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); ret[0] = size; auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); ret[1] = size; auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); ret[2] = size; return ret; diff --git a/paddle/phi/backends/gpu/musa/rocm_device_function.h b/paddle/phi/backends/gpu/musa/rocm_device_function.h deleted file mode 100644 index 6f5d684075f0f..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_device_function.h +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. 
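// Illustrative sketch, not part of the original sources: the musa_info.cc hunk
// above replaces the hard-coded `return 100;` (and the commented-out
// musaDeviceAttribute* query) with live musaDeviceGetAttribute calls using the
// musaDevAttr* enum names. A minimal standalone version of that pattern is
// shown below; the <musa_runtime.h> header name is an assumption and should be
// checked against the installed MUSA SDK.
#include <musa_runtime.h>

// Returns major * 100 + minor, the same encoding GetGPUComputeCapability uses.
inline int QueryComputeCapability(int device_id) {
  int major = 0, minor = 0;
  // Both calls return a musaError_t; the Paddle code wraps them in
  // PADDLE_ENFORCE_GPU_SUCCESS instead of discarding the status.
  musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, device_id);
  musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, device_id);
  return major * 100 + minor;
}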
-#define PADDLE_CUDA_FP16 -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" - -namespace phi { -namespace backends { -namespace gpu { - -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down(val, delta, width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor(val, width); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { - return phi::dtype::float16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { - return phi::dtype::bfloat16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { - return phi::dtype::float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { - return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl(val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. 
- // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else - const int warpSize = 32; -#endif - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_helper.h b/paddle/phi/backends/gpu/musa/rocm_helper.h deleted file mode 100644 index 07fdde5a2f417..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace phi { -namespace backends { -namespace gpu { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why need this macro? ] - * - * The original looping in CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB, the first iteration is no problem here, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * will greater than INT_MAX and overflow becomes negative value, at - * this time, the cycle condition `i < (n)` is still satisfied, so it - * will cause illegal access to cuda memory. - * - * Here is a real example in ERINE, it will trigger above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follow, the int64_t __index__ will - * prevent overflow in the loop increment. 
- * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ - int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_info.cc b/paddle/phi/backends/gpu/musa/rocm_info.cc deleted file mode 100644 index 32c7c329253b1..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_info.cc +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "paddle/phi/backends/gpu/gpu_info.h" - -#include "paddle/phi/core/enforce.h" - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -namespace phi { -namespace backends { -namespace gpu { - -int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -} - -static int GetGPUDeviceCountImpl() { - int driverVersion = 0; - musaError_t status = musaDriverGetVersion(&driverVersion); - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - - const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); - - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " - "empty. 
No GPU detected."; - return 0; - } - } - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); - return count; -} - -int GetGPUDeviceCount() { - // cache the count - static auto dev_cnt = GetGPUDeviceCountImpl(); - return dev_cnt; -} - -int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int major, minor; - auto major_error_code = musaDeviceGetAttribute( - &major, musaDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = musaDeviceGetAttribute( - &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - return major * 100 + minor; -} - -int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int runtime_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); - return runtime_version; -} - -int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int driver_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); - return driver_version; -} - -bool TensorCoreAvailable() { return false; } - -int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); - return count; -} - -int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); - - return count; -} - -int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); - return count; -} - -int GetCurrentDeviceId() { - int device_id; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); - return device_id; -} - -std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - std::array ret; - int size; - auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret[0] = size; - - auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret[1] = size; - - auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret[2] = size; - return ret; -} - -std::pair GetGpuStreamPriorityRange() { - int least_priority, greatest_priority; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); - return std::make_pair(least_priority, greatest_priority); -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = GetGPUDeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(phi::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, - static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); -} - -void GpuMemcpyAsync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); -} - -void GpuMemcpySync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); -} - -void GpuMemcpyPeerAsync(void *dst, - int dst_device, - const void *src, - int src_device, - size_t count, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -} - -void GpuMemcpyPeerSync( - void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeer(dst, dst_device, src, src_device, count)); -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); -} - -void GpuStreamSync(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); -} - -void GpuDestroyStream(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); -} - -void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } - -gpuError_t GpuGetLastError() { return musaGetLastError(); } - -bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - // TODO(qili93): Hygon DTK (21.04 and 22.04) not support - // musaDeviceAttributeManagedMemory, temporary disable by default, to be - // verified in next DTK release - return false; -} - -bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); -#ifdef __linux__ - return IsGPUManagedMemorySupported(dev_id) && - GetGPUComputeCapability(dev_id) >= 60; -#else - return false; -#endif -} - -} // namespace gpu -} // namespace backends -} // namespace phi
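The descriptor classes deleted from miopen_desc.h above (ActivationDescriptor, TensorDescriptor, FilterDescriptor, ConvolutionDescriptor) all follow one ownership pattern: the constructor calls the matching miopenCreate* function, the raw handle is held in a std::unique_ptr whose custom Deleter calls miopenDestroy*, and desc() exposes the handle to callers. A stripped-down sketch of that pattern follows; RawDesc, CreateRawDesc and DestroyRawDesc are placeholders standing in for the real MIOpen create/destroy pairs, and the error checking (PADDLE_ENFORCE_GPU_SUCCESS in the original) is reduced to comments.

#include <memory>

// Placeholder handle type and create/destroy pair; the deleted classes call
// miopenCreate*Descriptor / miopenDestroy*Descriptor here instead.
struct RawDesc { int unused = 0; };
inline RawDesc* CreateRawDesc() { return new RawDesc{}; }
inline void DestroyRawDesc(RawDesc* p) { delete p; }

class ScopedDesc {
 public:
  struct Deleter {
    void operator()(RawDesc* p) const {
      if (p != nullptr) DestroyRawDesc(p);  // original checks the return status
    }
  };

  ScopedDesc() : desc_(CreateRawDesc()) {}  // original checks the return status

  RawDesc* desc() const { return desc_.get(); }

 private:
  std::unique_ptr<RawDesc, Deleter> desc_;
};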
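The tensor and filter setters deleted above (TensorDescriptor::set, FilterDescriptor::set, ScopedTensorDescriptor::descriptor) all rebuild fully packed strides from the dimension vector before calling miopenSetTensorDescriptor: the innermost stride is 1 and each outer stride is the product of the inner extents, and when groups > 1 the channel extent is divided by groups first. A small standalone version of the stride computation, with an NCHW example:

#include <cassert>
#include <vector>

// Packed row-major strides: strides[last] = 1,
// strides[i] = dims[i + 1] * strides[i + 1], exactly as in the deleted setters.
std::vector<int> ContiguousStrides(const std::vector<int>& dims) {
  assert(!dims.empty());
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  return strides;
}

int main() {
  // NCHW shape [2, 3, 4, 5] gives strides [60, 20, 5, 1].
  assert((ContiguousStrides({2, 3, 4, 5}) == std::vector<int>{60, 20, 5, 1}));
  return 0;
}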
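The deleted rocm_device_function.h wraps the raw __shfl_down / __shfl_xor intrinsics in typed helpers (CudaShuffleDownSync, CudaShuffleXorSync, plus specializations for float16, bfloat16 and complex) so that reductions can be written once for all element types. A minimal warp-sum device helper built on that wrapper, mirroring the inner loop of the deleted reduceSum, could look as follows; WarpReduceSum is a hypothetical name and the snippet assumes the (now removed) header is still in scope.

// Sums `val` across the lanes selected by `mask`; lane 0 ends up with the
// warp-wide total. warpSize is the device built-in (64 on HIP wavefronts,
// 32 on most other GPUs).
template <typename T>
__forceinline__ __device__ T WarpReduceSum(unsigned mask, T val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Each step folds in the value held `offset` lanes above the current lane.
    val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
  }
  return val;
}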