diff --git a/paddle/phi/backends/gpu/musa/miopen_desc.h b/paddle/phi/backends/gpu/musa/miopen_desc.h deleted file mode 100644 index ae0e274ca650e..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_desc.h +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#include "paddle/phi/core/utils/data_type.h" - -namespace phi { -namespace backends { -namespace gpu { - -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); - int H, W, D, C; - if (dims.size() == 4) { - H = dims[1]; - W = dims[2]; - C = dims[3]; - transformed_dims[1] = C; - transformed_dims[2] = H; - transformed_dims[3] = W; - } else { - D = dims[1]; - H = dims[2]; - W = dims[3]; - C = dims[4]; - transformed_dims[1] = C; - transformed_dims[2] = D; - transformed_dims[3] = H; - transformed_dims[4] = W; - } - return transformed_dims; -} - -inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { - miopenDataType_t type = miopenFloat; - switch (t) { - case phi::DataType::FLOAT16: - type = miopenHalf; - break; - case phi::DataType::FLOAT32: - type = miopenFloat; - break; - default: - break; - } - return type; -} - -class ActivationDescriptor { - public: - using T = miopenActivationDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(t)); - t = nullptr; - } - } - }; - ActivationDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - template - void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_.get(), mode, static_cast(coef), 0.0, 0.0)); - } - - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - private: - std::unique_ptr desc_; -}; - -class TensorDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - TensorDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / 
groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - void set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { - const int groups = 1; - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class FilterDescriptor { - public: - using T = miopenTensorDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); - t = nullptr; - } - } - }; - FilterDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const phi::DenseTensor& tensor, - const miopenTensorFormat_t format, - const int groups = 1) { - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(tensor.dtype()), - static_cast(dims_with_group.size()), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } - - private: - std::unique_ptr desc_; -}; - -class ConvolutionDescriptor { - public: - using T = miopenConvolutionDescriptor; - struct Deleter { - void operator()(T* t) { - if (t != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(t)); - t = nullptr; - } - } - }; - ConvolutionDescriptor() { - T* raw_ptr; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); - desc_.reset(raw_ptr); - } - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(miopenDataType_t dtype, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations, - bool allow_tf32, - const int groups = 1) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - (miopenConvolutionDescriptor_t)desc_.get(), - static_cast(pads.size()), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( - 
(miopenConvolutionDescriptor_t)desc_.get(), groups)); - } - - private: - std::unique_ptr desc_; -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/miopen_helper.h b/paddle/phi/backends/gpu/musa/miopen_helper.h deleted file mode 100644 index 095f32ba460d0..0000000000000 --- a/paddle/phi/backends/gpu/musa/miopen_helper.h +++ /dev/null @@ -1,595 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "gflags/gflags.h" - -#include "paddle/phi/backends/dynload/miopen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" - -// MIOPEN do not have epslion definition -#define CUDNN_BN_MIN_EPSILON 1e-05 - -DECLARE_bool(cudnn_deterministic); - -namespace phi { -namespace backends { -namespace gpu { - -inline const char* miopenGetErrorString(miopenStatus_t status) { - switch (status) { - case miopenStatusSuccess: - return "miopenStatusSuccess"; - case miopenStatusNotInitialized: - return "miopenStatusNotInitialized"; - case miopenStatusAllocFailed: - return "miopenStatusAllocFailed"; - case miopenStatusBadParm: - return "miopenStatusBadParm"; - case miopenStatusInternalError: - return "miopenStatusInternalError"; - case miopenStatusInvalidValue: - return "miopenStatusInvalidValue"; - case miopenStatusUnknownError: - return "miopenStatusUnknownError"; - case miopenStatusNotImplemented: - return "miopenStatusNotImplemented"; - default: - return "Unknown miopen error number"; - } -} - -// no use, but will have compiling error if not defined -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) - -enum class DataLayout { // Not use - kNHWC, - kNCHW, - kNCDHW, - kNDHWC, // add, liyamei - kNCHW_VECT_C, -}; - -enum class PoolingMode { - kMaximum, - kMaximumDeterministic, - kAverageExclusive, - kAverageInclusive, -}; - -enum class ActivationMode { - kNone, // activation identity - kSigmoid, - kRelu, - kRelu6, - kReluX, - kTanh, - kBandPass, -}; - -inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return miopenPoolingMax; - case PoolingMode::kAverageExclusive: - return miopenPoolingAverage; - case PoolingMode::kAverageInclusive: - return miopenPoolingAverageInclusive; - case PoolingMode::kMaximum: - return miopenPoolingMax; - default: - PADDLE_THROW( - phi::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); - } -} - -inline ActivationMode StringToActivationMode(const std::string& str) { - if (str == "identity") { - return ActivationMode::kNone; - } else if (str == "sigmoid") { - return ActivationMode::kSigmoid; - } else if (str == "relu") { - return ActivationMode::kRelu; - } else if (str == 
"relu6") { - return ActivationMode::kRelu6; - } else if (str == "relux") { - return ActivationMode::kReluX; - } else if (str == "tanh") { - return ActivationMode::kTanh; - } else if (str == "bandpass") { - return ActivationMode::kBandPass; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unknown MIOPEN activation string: %s.", str)); - } -} - -template -class CudnnDataType; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenHalf; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenBFloat16; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const miopenDataType_t type = miopenFloat; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { - switch (order) { - case DataLayout::kNHWC: - return MIOPEN_TENSOR_NHWC; - case DataLayout::kNCHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNCDHW: - return MIOPEN_TENSOR_NCHW; - case DataLayout::kNDHWC: - return MIOPEN_TENSOR_NHWC; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "MIOPEN has no equivalent dataLayout for input order.")); - } - return MIOPEN_TENSOR_NCHW; -} - -class ScopedTensorDescriptor { - public: - ScopedTensorDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& dims, - const int groups = 1) { - // the format is not used now, will add later - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - // Update tensor descriptor dims setting if groups > 1 - // NOTE: Here, Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - - // MIOPEN ONLY support data layout of NCHW - PADDLE_ENFORCE_EQ( - format, - MIOPEN_TENSOR_NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - if (dims.size() == 4) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - const_cast(strides.data()))); - } else if (dims.size() == 5) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - dims_with_group.size(), - const_cast(dims_with_group.data()), - 
const_cast(strides.data()))); - } - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); - } - - inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, - const std::vector& dim, - const std::vector& stride) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - miopen_type, - dim.size(), - const_cast(dim.data()), - const_cast(stride.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const std::vector& dim, - const std::vector& stride) { - return descriptor(CudnnDataType::type, dim, stride); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); -}; - -class ScopedDropoutDescriptor { - public: - ScopedDropoutDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateDropoutDescriptor(&desc_)); - } - ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyDropoutDescriptor(desc_)); - } - - inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, - const phi::Place& place, - bool initialized, - float dropout_prob_, - phi::DenseTensor* dropout_state_, - int seed, - size_t state_size) { - if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - 0 /* dropout */, - nullptr, - 0 /* state_size */, - 0 /* seed */, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - return desc_; - } - auto* dropout_state_data = dropout_state_->data(); - if (!initialized) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - seed, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } else { - auto dropout_state_dims = dropout_state_->dims(); - state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( - desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - 0, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } - return desc_; - } - inline miopenDropoutDescriptor_t desc() { return desc_; } - - private: - miopenDropoutDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); -}; - -class ScopedRNNDescriptor { - public: - ScopedRNNDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); - } - ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); - } - - inline miopenRNNDescriptor_t desc() { return desc_; } - - private: - miopenRNNDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); -}; - -class ScopedFilterDescriptor { - public: - ScopedFilterDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); - } - ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); - } - - inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, - const miopenDataType_t type, - const std::vector& kernel, - const int groups = 1) { - // filter layout: MCHW(MCDHW), where M is the number of - // output image channels, C is the number of input image channels, - // D is the 
depth of the filter, H is the height of the filter, and W is the - // width of the filter. - std::vector kernel_with_group(kernel.begin(), kernel.end()); - if (groups > 1) { - kernel_with_group[0] /= groups; - // NOTE: input filter(C) of the filter is already asserted to be C/groups. - } - std::vector stride_dim(kernel_with_group.size()); - stride_dim.push_back(1); - for (int k = kernel_with_group.size() - 2; k >= 0; k--) { - stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( - desc_, - type, - kernel_with_group.size(), - const_cast(kernel_with_group.data()), - const_cast(stride_dim.data()))); - return desc_; - } - - template - inline miopenTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& kernel, - const int groups = 1) { - return descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, kernel, groups); - } - - inline miopenTensorDescriptor_t desc() { return desc_; } - - private: - miopenTensorDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); -}; - -class ScopedConvolutionDescriptor { - public: - ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); - } - ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); - } - - inline miopenConvolutionDescriptor_t descriptor( - miopenDataType_t type, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - PADDLE_ENFORCE_EQ(pads.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of pads and strides should be equal. But " - "received size of pads is %d, size of strides is %d.", - pads.size(), - strides.size())); - PADDLE_ENFORCE_EQ( - pads.size(), - dilations.size(), - phi::errors::InvalidArgument( - "The size of pads and dilations should be equal. But received size " - "of pads is %d, size of dilations is %d.", - pads.size(), - dilations.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - desc_, - pads.size(), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - return desc_; - } - - template - inline miopenConvolutionDescriptor_t descriptor( - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations) { - return descriptor(CudnnDataType::type, pads, strides, dilations); - } - - private: - miopenConvolutionDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); -}; - -class ScopedPoolingDescriptor { - public: - ScopedPoolingDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreatePoolingDescriptor(&desc_)); - } - ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyPoolingDescriptor(desc_)); - } - - inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, - const std::vector& kernel, - const std::vector& pads, - const std::vector& strides) { - PADDLE_ENFORCE_EQ(kernel.size(), - pads.size(), - phi::errors::InvalidArgument( - "The size of kernel and pads should be equal. But " - "received size of kernel is %d, size of pads is %d.", - kernel.size(), - pads.size())); - PADDLE_ENFORCE_EQ( - kernel.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of kernel and strides should be equal. 
But " - "received size of kernel is %d, size of strides is %d.", - kernel.size(), - strides.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( - desc_, - GetPoolingMode(mode), - kernel.size(), - const_cast(kernel.data()), - const_cast(pads.data()), - const_cast(strides.data()))); - return desc_; - } - - private: - miopenPoolingDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); -}; - -class ScopedActivationDescriptor { - public: - ScopedActivationDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&desc_)); - } - ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(desc_)); - } - - template - inline miopenActivationDescriptor_t descriptor( - const std::string& act, double value_max = static_cast(0.)) { - double relu_ceiling = 0.0; - ActivationMode activation_mode = StringToActivationMode(act); - miopenActivationMode_t mode; - switch (activation_mode) { - case ActivationMode::kNone: - mode = miopenActivationPASTHRU; - break; - case ActivationMode::kRelu6: - relu_ceiling = 6.0; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kReluX: - relu_ceiling = value_max; - mode = miopenActivationCLIPPEDRELU; - break; - case ActivationMode::kRelu: - mode = miopenActivationRELU; - break; - case ActivationMode::kSigmoid: - mode = miopenActivationLOGISTIC; - break; - case ActivationMode::kTanh: - mode = miopenActivationTANH; - break; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "Unrecognized MIOPEN activation mode: %d.", - static_cast(activation_mode))); - } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( - desc_, mode, relu_ceiling, 0.0, 0.0)); - return desc_; - } - - private: - miopenActivationDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); -}; - -class ScopedCTCLossDescriptor { - public: - ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); - } - ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); - } - - template - inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( - desc_, CudnnDataType::type, 0, false)); - return desc_; - } - - private: - miopenCTCLossDescriptor_t desc_; - DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc index 6579ce63f21f6..f2087e4d7f4fc 100644 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -88,16 +88,15 @@ int GetGPUComputeCapability(int id) { "but received id is: %d. 
GPU count is: %d.", id, GetGPUDeviceCount())); - return 100; - //int major, minor; - //auto major_error_code = musaDeviceGetAttribute( - // &major, musaDeviceAttributeComputeCapabilityMajor, id); - //auto minor_error_code = musaDeviceGetAttribute( - // &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - //PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - //PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - //return major * 100 + minor; + int major, minor; + auto major_error_code = musaDeviceGetAttribute( + &major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = musaDeviceGetAttribute( + &minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; } int GetGPURuntimeVersion(int id) { @@ -138,7 +137,8 @@ int GetGPUMultiProcessors(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); + + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); return count; } @@ -152,7 +152,7 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); return count; } @@ -167,7 +167,7 @@ int GetGPUMaxThreadsPerBlock(int id) { GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); return count; } @@ -188,17 +188,17 @@ std::array GetGpuMaxGridDimSize(int id) { std::array ret; int size; auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); ret[0] = size; auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); ret[1] = size; auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); + musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); ret[2] = size; return ret; diff --git a/paddle/phi/backends/gpu/musa/rocm_device_function.h b/paddle/phi/backends/gpu/musa/rocm_device_function.h deleted file mode 100644 index 6f5d684075f0f..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_device_function.h +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. 
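// Illustrative sketch, not part of the original sources: the musa_info.cc hunk
// above replaces the hard-coded `return 100;` (and the commented-out
// musaDeviceAttribute* query) with live musaDeviceGetAttribute calls using the
// musaDevAttr* enum names. A minimal standalone version of that pattern is
// shown below; the <musa_runtime.h> header name is an assumption and should be
// checked against the installed MUSA SDK.
#include <musa_runtime.h>

// Returns major * 100 + minor, the same encoding GetGPUComputeCapability uses.
inline int QueryComputeCapability(int device_id) {
  int major = 0, minor = 0;
  // Both calls return a musaError_t; the Paddle code wraps them in
  // PADDLE_ENFORCE_GPU_SUCCESS instead of discarding the status.
  musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, device_id);
  musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, device_id);
  return major * 100 + minor;
}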
-#define PADDLE_CUDA_FP16 -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" - -namespace phi { -namespace backends { -namespace gpu { - -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down(val, delta, width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor(val, width); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { - return phi::dtype::float16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { - return phi::dtype::bfloat16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { - return phi::dtype::float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { - return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return phi::dtype::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl(val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. 
- // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else - const int warpSize = 32; -#endif - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_helper.h b/paddle/phi/backends/gpu/musa/rocm_helper.h deleted file mode 100644 index 07fdde5a2f417..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace phi { -namespace backends { -namespace gpu { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why need this macro? ] - * - * The original looping in CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB, the first iteration is no problem here, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * will greater than INT_MAX and overflow becomes negative value, at - * this time, the cycle condition `i < (n)` is still satisfied, so it - * will cause illegal access to cuda memory. - * - * Here is a real example in ERINE, it will trigger above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follow, the int64_t __index__ will - * prevent overflow in the loop increment. 
- * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ - int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/rocm_info.cc b/paddle/phi/backends/gpu/musa/rocm_info.cc deleted file mode 100644 index 32c7c329253b1..0000000000000 --- a/paddle/phi/backends/gpu/musa/rocm_info.cc +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "paddle/phi/backends/gpu/gpu_info.h" - -#include "paddle/phi/core/enforce.h" - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -namespace phi { -namespace backends { -namespace gpu { - -int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -} - -static int GetGPUDeviceCountImpl() { - int driverVersion = 0; - musaError_t status = musaDriverGetVersion(&driverVersion); - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - - const auto *cuda_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); - - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " - "empty. 
No GPU detected."; - return 0; - } - } - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); - return count; -} - -int GetGPUDeviceCount() { - // cache the count - static auto dev_cnt = GetGPUDeviceCountImpl(); - return dev_cnt; -} - -int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int major, minor; - auto major_error_code = musaDeviceGetAttribute( - &major, musaDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = musaDeviceGetAttribute( - &minor, musaDeviceAttributeComputeCapabilityMinor, id); - - PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - return major * 100 + minor; -} - -int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int runtime_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); - return runtime_version; -} - -int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int driver_version = 0; - PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); - return driver_version; -} - -bool TensorCoreAvailable() { return false; } - -int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMultiprocessorCount, id)); - return count; -} - -int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDeviceAttributeMaxThreadsPerMultiProcessor, id)); - - return count; -} - -int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDeviceAttributeMaxThreadsPerBlock, id)); - return count; -} - -int GetCurrentDeviceId() { - int device_id; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); - return device_id; -} - -std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - std::array ret; - int size; - auto error_code_x = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimX, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret[0] = size; - - auto error_code_y = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimY, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret[1] = size; - - auto error_code_z = - musaDeviceGetAttribute(&size, musaDeviceAttributeMaxGridDimZ, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret[2] = size; - return ret; -} - -std::pair GetGpuStreamPriorityRange() { - int least_priority, greatest_priority; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); - return std::make_pair(least_priority, greatest_priority); -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = GetGPUDeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(phi::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, - static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceProperties(&g_device_props[id], id)); - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); -} - -void GpuMemcpyAsync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); -} - -void GpuMemcpySync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); -} - -void GpuMemcpyPeerAsync(void *dst, - int dst_device, - const void *src, - int src_device, - size_t count, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -} - -void GpuMemcpyPeerSync( - void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeer(dst, dst_device, src, src_device, count)); -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); -} - -void GpuStreamSync(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); -} - -void GpuDestroyStream(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); -} - -void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } - -gpuError_t GpuGetLastError() { return musaGetLastError(); } - -bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - // TODO(qili93): Hygon DTK (21.04 and 22.04) not support - // musaDeviceAttributeManagedMemory, temporary disable by default, to be - // verified in next DTK release - return false; -} - -bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); -#ifdef __linux__ - return IsGPUManagedMemorySupported(dev_id) && - GetGPUComputeCapability(dev_id) >= 60; -#else - return false; -#endif -} - -} // namespace gpu -} // namespace backends -} // namespace phi
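The descriptor classes deleted from miopen_desc.h above (ActivationDescriptor, TensorDescriptor, FilterDescriptor, ConvolutionDescriptor) all follow one ownership pattern: the constructor calls the matching miopenCreate* function, the raw handle is held in a std::unique_ptr whose custom Deleter calls miopenDestroy*, and desc() exposes the handle to callers. A stripped-down sketch of that pattern follows; RawDesc, CreateRawDesc and DestroyRawDesc are placeholders standing in for the real MIOpen create/destroy pairs, and the error checking (PADDLE_ENFORCE_GPU_SUCCESS in the original) is reduced to comments.

#include <memory>

// Placeholder handle type and create/destroy pair; the deleted classes call
// miopenCreate*Descriptor / miopenDestroy*Descriptor here instead.
struct RawDesc { int unused = 0; };
inline RawDesc* CreateRawDesc() { return new RawDesc{}; }
inline void DestroyRawDesc(RawDesc* p) { delete p; }

class ScopedDesc {
 public:
  struct Deleter {
    void operator()(RawDesc* p) const {
      if (p != nullptr) DestroyRawDesc(p);  // original checks the return status
    }
  };

  ScopedDesc() : desc_(CreateRawDesc()) {}  // original checks the return status

  RawDesc* desc() const { return desc_.get(); }

 private:
  std::unique_ptr<RawDesc, Deleter> desc_;
};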
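The tensor and filter setters deleted above (TensorDescriptor::set, FilterDescriptor::set, ScopedTensorDescriptor::descriptor) all rebuild fully packed strides from the dimension vector before calling miopenSetTensorDescriptor: the innermost stride is 1 and each outer stride is the product of the inner extents, and when groups > 1 the channel extent is divided by groups first. A small standalone version of the stride computation, with an NCHW example:

#include <cassert>
#include <vector>

// Packed row-major strides: strides[last] = 1,
// strides[i] = dims[i + 1] * strides[i + 1], exactly as in the deleted setters.
std::vector<int> ContiguousStrides(const std::vector<int>& dims) {
  assert(!dims.empty());
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  return strides;
}

int main() {
  // NCHW shape [2, 3, 4, 5] gives strides [60, 20, 5, 1].
  assert((ContiguousStrides({2, 3, 4, 5}) == std::vector<int>{60, 20, 5, 1}));
  return 0;
}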
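The deleted rocm_device_function.h wraps the raw __shfl_down / __shfl_xor intrinsics in typed helpers (CudaShuffleDownSync, CudaShuffleXorSync, plus specializations for float16, bfloat16 and complex) so that reductions can be written once for all element types. A minimal warp-sum device helper built on that wrapper, mirroring the inner loop of the deleted reduceSum, could look as follows; WarpReduceSum is a hypothetical name and the snippet assumes the (now removed) header is still in scope.

// Sums `val` across the lanes selected by `mask`; lane 0 ends up with the
// warp-wide total. warpSize is the device built-in (64 on HIP wavefronts,
// 32 on most other GPUs).
template <typename T>
__forceinline__ __device__ T WarpReduceSum(unsigned mask, T val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Each step folds in the value held `offset` lanes above the current lane.
    val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
  }
  return val;
}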