diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 03c252909d923..5076fc19e078b 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -115,6 +115,23 @@ class AllocatorFacadePrivate {
       break;
     }
 
+    case AllocatorStrategy::kSamplePool: {
+      InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+      for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+        InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+      }
+#endif
+#ifdef PADDLE_WITH_CUDA
+      for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
+           ++dev_id) {
+        InitSampleCUDAAllocator(platform::CUDAPlace(dev_id));
+      }
+      InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+      break;
+    }
+
     default: {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "Unsupported allocator strategy: %d", static_cast<int>(strategy)));
@@ -123,7 +140,8 @@ class AllocatorFacadePrivate {
     InitZeroSizeAllocators();
     InitSystemAllocators();
 
-    if (FLAGS_gpu_allocator_retry_time > 0) {
+    if (strategy != AllocatorStrategy::kSamplePool &&
+        FLAGS_gpu_allocator_retry_time > 0) {
       WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
     }
@@ -188,6 +206,10 @@ class AllocatorFacadePrivate {
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         cuda_allocator, platform::GpuMinChunkSize());
   }
+  void InitSampleCUDAAllocator(platform::CUDAPlace p) {
+    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    allocators_[p] = std::make_shared<SampleAllocator>(cuda_allocator);
+  }
 #endif
 
 #ifdef PADDLE_WITH_XPU
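Note that the facade skips `WrapCUDARetryAllocator` for the new strategy: `SampleAllocator` carries its own out-of-memory fallback, dropping every cached block and retrying the raw allocation once (see `AllocateImpl` in retry_allocator.cc below). A minimal Python sketch of that fallback, with hypothetical names standing in for the C++ internals:

```python
def allocate_with_cache_flush(raw_alloc, free_all_cache, nbytes):
    """Mirror of SampleAllocator's OOM fallback: on failure, flush the
    cache of reusable blocks and retry the underlying allocation once."""
    try:
        return raw_alloc(nbytes)
    except MemoryError:           # stands in for paddle's BadAlloc
        free_all_cache()          # return every cached block to the device
        return raw_alloc(nbytes)  # a second failure propagates to the caller
```
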
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 518b31e943048..e6f03b49c4add 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -35,7 +35,9 @@ static AllocatorStrategy GetStrategyFromFlag() {
   if (FLAGS_allocator_strategy == "thread_local") {
     return AllocatorStrategy::kThreadLocal;
   }
-
+  if (FLAGS_allocator_strategy == "sample_pool") {
+    return AllocatorStrategy::kSamplePool;
+  }
   PADDLE_THROW(platform::errors::InvalidArgument(
       "Unsupported allocator strategy: %s, condicates are naive_best_fit, "
       "auto_growth or thread_local.",
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
index 0db9d93e3e645..b1d9129cee3a5 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,12 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };
+enum class AllocatorStrategy {
+  kNaiveBestFit,
+  kAutoGrowth,
+  kThreadLocal,
+  kSamplePool
+};
 
 extern AllocatorStrategy GetAllocatorStrategy();
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index ae6af53241dfe..79e428b99ce30 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -14,6 +14,11 @@
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 
+DEFINE_int32(sample_max_bin_bytes, 2048, "max MB of memory the pool may cache");
+DEFINE_int32(sample_bin_growth, 2, "geometric growth factor for bin sizes");
+DEFINE_int32(sample_min_bin, 8, "smallest bin is bin_growth^min_bin bytes");
+DEFINE_bool(sample_debug_info, false, "print sample pool debug info");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -94,6 +99,145 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
   }
 }
 
+static const unsigned int INVALID_BIN = (unsigned int)-1;
+SampleAllocator::BlockDescriptor::BlockDescriptor(Allocation* ptr)
+    : d_ptr(ptr), bytes(0), used(0), bin(INVALID_BIN) {}
+SampleAllocator::BlockDescriptor::BlockDescriptor()
+    : d_ptr(NULL), bytes(0), used(0), bin(INVALID_BIN) {}
+bool SampleAllocator::BlockDescriptor::ptrcompare(const BlockDescriptor& a,
+                                                  const BlockDescriptor& b) {
+  return (a.d_ptr < b.d_ptr);
+}
+bool SampleAllocator::BlockDescriptor::sizecompare(const BlockDescriptor& a,
+                                                   const BlockDescriptor& b) {
+  return (a.bytes < b.bytes);
+}
+SampleAllocator::SampleAllocator(std::shared_ptr<Allocator> allocator)
+    : allocator_(std::move(allocator)),
+      bin_growth_(FLAGS_sample_bin_growth),
+      min_bin_(FLAGS_sample_min_bin),
+      min_bin_bytes_(pow(bin_growth_, min_bin_)),
+      max_bin_bytes_(FLAGS_sample_max_bin_bytes * 1024 * 1024),
+      cached_blocks_(BlockDescriptor::sizecompare),
+      live_blocks_(BlockDescriptor::ptrcompare) {
+  PADDLE_ENFORCE_NOT_NULL(
+      allocator_, platform::errors::InvalidArgument(
+                      "Underlying allocator of SampleAllocator is NULL"));
+  VLOG(0) << "SampleAllocator init";
+}
+void SampleAllocator::FreeImpl(Allocation* allocation) {
+  if (allocation == NULL) {
+    return;
+  }
+  bool recached = false;
+  BlockDescriptor search_key(allocation);
+
+  mutex_.lock();
+  auto block_itr = live_blocks_.find(search_key);
+  if (block_itr != live_blocks_.end()) {
+    search_key = *block_itr;
+    live_blocks_.erase(block_itr);
+    cached_bytes_.live -= search_key.bytes;
+    cached_bytes_.used -= search_key.used;
+    if (search_key.bin != INVALID_BIN) {
+      recached = true;
+      cached_blocks_.insert(search_key);
+      cached_bytes_.free += search_key.bytes;
+    }
+  }
+  mutex_.unlock();
+
+  if (!recached) {
+    allocator_->Free(allocation);
+  }
+
+  if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
+    VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
+            << "MB, used: " << (cached_bytes_.used >> 20)
+            << "MB, free: " << (cached_bytes_.free >> 20)
+            << "MB, free big memory: " << search_key.bytes << " bytes";
+  }
+}
+// Allocate memory, preferring a cached block of the matching bin
+Allocation* SampleAllocator::AllocateImpl(size_t bytes) {
+  // Create a block descriptor for the requested allocation
+  bool found = false;
+  BlockDescriptor search_key;
+  search_key.used = bytes;
+  if (bytes > max_bin_bytes_) {
+    search_key.bin = INVALID_BIN;
+    search_key.bytes = bytes;
+  } else {
+    if (bytes < min_bin_bytes_) {
+      search_key.bin = min_bin_;
+      search_key.bytes = min_bin_bytes_;
+    } else {
+      search_key.bin = 0;
+      search_key.bytes = 1;
+      while (search_key.bytes < bytes) {
+        search_key.bytes *= bin_growth_;
+        ++search_key.bin;
+      }
+    }
+    mutex_.lock();
+    auto block_itr = cached_blocks_.lower_bound(search_key);
+    if ((block_itr != cached_blocks_.end()) &&
+        (block_itr->bin == search_key.bin)) {
+      found = true;
+      search_key = *block_itr;
+      search_key.used = bytes;
+      live_blocks_.insert(search_key);
+      // Remove from free blocks
+      cached_bytes_.free -= search_key.bytes;
+      cached_bytes_.live += search_key.bytes;
+      cached_bytes_.used += search_key.used;
+      cached_blocks_.erase(block_itr);
+    }
+    mutex_.unlock();
+  }
+  if (!found) {
+    try {
+      search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
+    } catch (BadAlloc&) {
+      // release all free cache
+      FreeAllCache();
+      // retry the raw device allocation
+      search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
+    } catch (...) {
+      throw;
+    }
+    mutex_.lock();
+    live_blocks_.insert(search_key);
+    cached_bytes_.live += search_key.bytes;
+    cached_bytes_.used += search_key.used;
+    mutex_.unlock();
+
+    if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
+      VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
+              << "MB, used: " << (cached_bytes_.used >> 20)
+              << "MB, free: " << (cached_bytes_.free >> 20)
+              << "MB, cuda alloc big memory: " << bytes << " bytes";
+    }
+  }
+  return search_key.d_ptr;
+}
+void SampleAllocator::FreeAllCache(void) {
+  mutex_.lock();
+  if (cached_blocks_.empty()) {
+    mutex_.unlock();
+    return;
+  }
+  while (!cached_blocks_.empty()) {
+    auto begin = cached_blocks_.begin();
+    allocator_->Free(begin->d_ptr);
+    cached_bytes_.free -= begin->bytes;
+    cached_blocks_.erase(begin);
+  }
+  mutex_.unlock();
+}
+
+void SampleAllocator::GetMemInfo(TotalBytes* info) { *info = cached_bytes_; }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
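The bin rounding in `AllocateImpl` is geometric: a request is rounded up to the next power of `bin_growth`, everything below `bin_growth^min_bin` shares the smallest bin, and anything above `sample_max_bin_bytes` bypasses the cache entirely (`INVALID_BIN`). A standalone sketch of the same arithmetic, assuming the default flag values above:

```python
GROWTH = 2                  # FLAGS_sample_bin_growth
MIN_BIN = 8                 # FLAGS_sample_min_bin -> 2**8 = 256 bytes
MAX_BIN_BYTES = 2048 << 20  # FLAGS_sample_max_bin_bytes, given in MB

def round_to_bin(nbytes):
    """Return (bin, rounded_size); bin is None for uncacheable requests."""
    if nbytes > MAX_BIN_BYTES:
        return None, nbytes          # INVALID_BIN: freed straight back to CUDA
    if nbytes < 2 ** MIN_BIN:
        return MIN_BIN, 2 ** MIN_BIN
    b, size = 0, 1
    while size < nbytes:             # same loop as AllocateImpl
        size *= GROWTH
        b += 1
    return b, size

assert round_to_bin(100) == (8, 256)            # small requests share one bin
assert round_to_bin(300 * 1000) == (19, 524288)  # rounded up to 2**19
assert round_to_bin(3 << 30)[0] is None          # 3 GB is never cached
```
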
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 031a5e2b97f17..557cb715fabe9 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -19,8 +19,8 @@
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
+#include <set>
 #include <utility>
-
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -45,9 +45,9 @@ class RetryAllocator : public Allocator {
   bool IsAllocThreadSafe() const override { return true; }
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-  uint64_t ReleaseImpl(const platform::Place& place) override {
+  void FreeImpl(Allocation *allocation) override;
+  Allocation *AllocateImpl(size_t size) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override {
     return underlying_allocator_->Release(place);
   }
 
@@ -60,6 +60,64 @@ class RetryAllocator : public Allocator {
   std::atomic<size_t> waited_allocate_size_{0};
 };
 
+class SampleAllocator : public Allocator {
+  /**
+   * Descriptor for device memory allocations
+   */
+  struct BlockDescriptor {
+    Allocation *d_ptr;  // Device pointer
+    size_t bytes;       // Size of the (rounded) allocation in bytes
+    size_t used;        // Bytes actually requested by the caller
+    unsigned int bin;   // Bin enumeration
+
+    explicit BlockDescriptor(Allocation *ptr);
+    BlockDescriptor();
+    static bool ptrcompare(const BlockDescriptor &a, const BlockDescriptor &b);
+    static bool sizecompare(const BlockDescriptor &a, const BlockDescriptor &b);
+  };
+  // BlockDescriptor comparator function interface
+  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+  /// Set type for cached blocks (ordered by size)
+  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+  /// Set type for live blocks (ordered by ptr)
+  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+ public:
+  // Aggregate byte totals kept by the pool
+  struct TotalBytes {
+    size_t free = 0;
+    size_t live = 0;
+    size_t used = 0;
+  };
+  explicit SampleAllocator(std::shared_ptr<Allocator> allocator);
+  bool IsAllocThreadSafe() const override { return true; }
+  void GetMemInfo(TotalBytes *info);
+
+ protected:
+  void FreeImpl(Allocation *allocation) override;
+  Allocation *AllocateImpl(size_t size) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override {
+    FreeAllCache();
+    return allocator_->Release(place);
+  }
+  void FreeAllCache(void);
+
+ private:
+  std::shared_ptr<Allocator> allocator_;
+  std::mutex mutex_;
+
+  unsigned int bin_growth_;  /// Geometric growth factor for bin sizes
+  unsigned int min_bin_;     /// Minimum bin enumeration
+  size_t min_bin_bytes_;     /// Smallest bin size: bin_growth_^min_bin_ bytes
+  size_t max_bin_bytes_;     /// Largest request the pool will cache
+
+  TotalBytes cached_bytes_;     /// Aggregate cached/live/used byte counts
+  CachedBlocks cached_blocks_;  /// Cached device allocations available for
+                                /// reuse
+  BusyBlocks live_blocks_;      /// Live device allocations currently in use
+};
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
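The two multisets give the pool its logarithmic lookups: `cached_blocks_` is ordered by rounded size, so `lower_bound` finds a cached block of the requested bin, while `live_blocks_` is ordered by pointer, so `FreeImpl` can recover the descriptor for an incoming `Allocation*`. The design (including the `BlockDescriptor`/bin vocabulary) closely resembles CUB's `CachingDeviceAllocator`. A toy Python model of the reuse discipline, with hypothetical names:

```python
import bisect
import itertools

_ids = itertools.count()
cache = []  # (rounded_bytes, block_id), sorted by size  ~ cached_blocks_
live = {}   # block_id -> rounded_bytes                  ~ live_blocks_

def pool_allocate(rounded_bytes):
    """Reuse a cached block of the same bin, else simulate a device malloc."""
    i = bisect.bisect_left(cache, (rounded_bytes, -1))  # ~ lower_bound
    if i < len(cache) and cache[i][0] == rounded_bytes:  # same size => same bin
        _, bid = cache.pop(i)                            # cache hit: recycle
    else:
        bid = next(_ids)                                 # miss: "cudaMalloc"
    live[bid] = rounded_bytes
    return bid

def pool_free(bid):
    """Instead of freeing, recycle the block into the size-ordered cache."""
    bisect.insort(cache, (live.pop(bid), bid))

a = pool_allocate(256)
pool_free(a)
assert pool_allocate(256) == a  # the freed block was reused, not re-allocated
```
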
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 1702c7cbc84d8..b0d18e54139e3 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -204,6 +204,10 @@ def __bootstrap__():
         'call_stack_level',
         'sort_sum_gradient',
         'max_inplace_grad_add',
+        'sample_max_bin_bytes',
+        'sample_bin_growth',
+        'sample_min_bin',
+        'sample_debug_info',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
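Since `__bootstrap__` only forwards flags present in the environment when `paddle.fluid` is first imported, the pool is enabled like any other allocator strategy. A hypothetical usage sketch (the flag values are illustrative, not recommendations):

```python
import os

# Must be set before the first `import paddle.fluid`.
os.environ['FLAGS_allocator_strategy'] = 'sample_pool'
os.environ['FLAGS_sample_max_bin_bytes'] = '2048'  # cache at most 2048 MB
os.environ['FLAGS_sample_bin_growth'] = '2'        # bins grow by powers of 2
os.environ['FLAGS_sample_min_bin'] = '8'           # smallest bin: 2**8 bytes
os.environ['FLAGS_sample_debug_info'] = '1'        # log pool statistics

import paddle.fluid as fluid  # noqa: E402  (import after env setup on purpose)
```
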