From 7299e7cd7c1a9fbb80055a89e6359ecb79d8a4de Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 25 Mar 2021 15:44:17 -0700
Subject: [PATCH 01/16] [SYCL] Pooling of USM memory allocated for buffers.

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 100 +++++++----
 sycl/plugins/level_zero/pi_level_zero.hpp |  17 +-
 sycl/plugins/level_zero/usm_allocator.cpp | 207 +++++++++++++++++++---
 sycl/plugins/level_zero/usm_allocator.hpp |   1 +
 4 files changed, 269 insertions(+), 56 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 5b46a9a00eb16..0bdf58470f97b 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2293,7 +2293,6 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
   }
 
   void *Ptr;
-  ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice;
 
   // We treat integrated devices (physical memory shared with the CPU)
   // differently from discrete devices (those with distinct memories).
@@ -2313,20 +2312,15 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
     //
   }
 
+  pi_result Result;
   if (DeviceIsIntegrated) {
-    ze_host_mem_alloc_desc_t ZeDesc = {};
-    ZeDesc.flags = 0;
-
-    ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, &Ptr));
-
+    Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, 64);
   } else {
-    ze_device_mem_alloc_desc_t ZeDesc = {};
-    ZeDesc.flags = 0;
-    ZeDesc.ordinal = 0;
-
-    ZE_CALL(zeMemAllocDevice,
-            (Context->ZeContext, &ZeDesc, Size, 1, ZeDevice, &Ptr));
+    Result = piextUSMDeviceAlloc(&Ptr, Context, Context->Devices[0], nullptr,
+                                 Size, 64);
   }
+  if (Result != PI_SUCCESS)
+    return Result;
 
   if (HostPtr) {
     if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
@@ -2394,7 +2388,7 @@ pi_result piMemRelease(pi_mem Mem) {
     } else {
       auto Buf = static_cast<_pi_buffer *>(Mem);
       if (!Buf->isSubBuffer()) {
-        ZE_CALL(zeMemFree, (Mem->Context->ZeContext, Mem->getZeHandle()));
+        piextUSMFree(Mem->Context, Mem->getZeHandle());
       }
     }
     delete Mem;
@@ -5152,28 +5146,6 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program,
   return mapError(ZeResult);
 }
 
-pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
-                            pi_usm_mem_properties *Properties, size_t Size,
-                            pi_uint32 Alignment) {
-  PI_ASSERT(Context, PI_INVALID_CONTEXT);
-
-  // Check that incorrect bits are not set in the properties.
-  PI_ASSERT(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)),
-            PI_INVALID_VALUE);
-
-  ze_host_mem_alloc_desc_t ZeDesc = {};
-  ZeDesc.flags = 0;
-  // TODO: translate PI properties to Level Zero flags
-  ZE_CALL(zeMemAllocHost,
-          (Context->ZeContext, &ZeDesc, Size, Alignment, ResultPtr));
-
-  PI_ASSERT(Alignment == 0 ||
-                reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
-            PI_INVALID_VALUE);
-
-  return PI_SUCCESS;
-}
-
 static bool ShouldUseUSMAllocator() {
   // Enable allocator by default if it's not explicitly disabled
   return std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR") == nullptr;
@@ -5233,6 +5205,28 @@ pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
+pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
+                           pi_usm_mem_properties *Properties, size_t Size,
+                           pi_uint32 Alignment) {
+  PI_ASSERT(Context, PI_INVALID_CONTEXT);
+
+  // Check that incorrect bits are not set in the properties.
+  PI_ASSERT(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)),
+            PI_INVALID_VALUE);
+
+  // TODO: translate PI properties to Level Zero flags
+  ze_host_mem_alloc_desc_t ZeHostDesc = {};
+  ZeHostDesc.flags = 0;
+  ZE_CALL(zeMemAllocHost,
+          (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr));
+
+  PI_ASSERT(Alignment == 0 ||
+                reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
+            PI_INVALID_VALUE);
+
+  return PI_SUCCESS;
+}
+
 pi_result USMFreeImpl(pi_context Context, void *Ptr) {
   ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
   return PI_SUCCESS;
@@ -5259,6 +5253,11 @@ pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
                             Alignment);
 }
 
+pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
+                                           pi_uint32 Alignment) {
+  return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+}
+
 void *USMMemoryAllocBase::allocate(size_t Size) {
   void *Ptr = nullptr;
 
@@ -5341,6 +5340,27 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
+pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
+                            pi_usm_mem_properties *Properties, size_t Size,
+                            pi_uint32 Alignment) {
+  if (!UseUSMAllocator ||
+      // L0 spec says that allocation fails if Alignment != 2^n, in order to
+      // keep the same behavior for the allocator, just call L0 API directly and
+      // return the error code.
+      ((Alignment & (Alignment - 1)) != 0)) {
+    return USMHostAllocImpl(ResultPtr, Context, Properties, Size, Alignment);
+  }
+
+  try {
+    *ResultPtr = Context->HostMemAllocContext->allocate(Size, Alignment);
+  } catch (const UsmAllocationException &Ex) {
+    *ResultPtr = nullptr;
+    return Ex.getError();
+  }
+
+  return PI_SUCCESS;
+}
+
 pi_result piextUSMFree(pi_context Context, void *Ptr) {
   if (!UseUSMAllocator) {
     return USMFreeImpl(Context, Ptr);
@@ -5356,6 +5376,16 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) {
           (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties,
            &ZeDeviceHandle));
 
+  // If memory type is host release from host pool
+  if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) {
+    try {
+      Context->HostMemAllocContext->deallocate(Ptr);
+    } catch (const UsmAllocationException &Ex) {
+      return Ex.getError();
+    }
+    return PI_SUCCESS;
+  }
+
   if (ZeDeviceHandle) {
     // All devices in the context are of the same platform.
     auto Platform = Context->Devices[0]->Platform;
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index 724cb7b599022..ed2e69f05d1ff 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -134,6 +134,16 @@ class USMDeviceMemoryAlloc : public USMMemoryAllocBase {
       : USMMemoryAllocBase(Ctx, Dev) {}
 };
 
+// Allocation routines for host memory type
+class USMHostMemoryAlloc : public USMMemoryAllocBase {
+protected:
+  pi_result allocateImpl(void **ResultPtr, size_t Size,
+                         pi_uint32 Alignment) override;
+
+public:
+  USMHostMemoryAlloc(pi_context Ctx) : USMMemoryAllocBase(Ctx, nullptr) {}
+};
+
 struct _pi_device : _pi_object {
   _pi_device(ze_device_handle_t Device, pi_platform Plt,
              bool isSubDevice = false)
@@ -196,6 +206,9 @@ struct _pi_context : _pi_object {
       // NOTE: one must additionally call initialize() to complete
       // PI context creation.
     }
+    // Create USM allocator context for host
+    HostMemAllocContext = new USMAllocContext(
+        std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
   }
 
   // Initialize the PI context.
@@ -257,10 +270,12 @@ struct _pi_context : _pi_object {
   pi_result decrementAliveEventsInPool(ze_event_pool_handle_t pool);
 
   // Store USM allocator context(internal allocator structures)
-  // for USM shared/host and device allocations. There is 1 allocator context
+  // for USM shared and device allocations. There is 1 allocator context
   // per each pair of (context, device) per each memory type.
   std::unordered_map<pi_device, USMAllocContext> SharedMemAllocContexts;
   std::unordered_map<pi_device, USMAllocContext> DeviceMemAllocContexts;
+  // Store the host allocator context. It does not depend on any device.
+  USMAllocContext *HostMemAllocContext;
 
 private:
   // Following member variables are used to manage assignment of events
diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index 1596bf71bb427..9425f93e97dcc 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -20,18 +20,24 @@
 #include <vector>
 
 #include "usm_allocator.hpp"
+#include <iostream>
 
 namespace settings {
-// Size of the slab which is going to be requested from the system.
-static constexpr size_t SlabSize = 64 * 1024; // 64K
+// Minimum allocation size that will be requested from the system.
+static constexpr size_t SlabSize = 64 * 1024; // 64KB
+
+// Allocations <= ChunkCutOff will use chunks from individual slabs.
+// Allocations >  ChunkCutOff will be rounded up to a multiple of
+// SlabSize and allocated to occupy the whole slab.
+static constexpr size_t ChunkCutOff = SlabSize / 2;
 // The largest size which is allocated via the allocator.
 // Allocations with size > CutOff bypass the USM allocator and
 // go directly to the runtime.
-static constexpr size_t CutOff = SlabSize / 2;
+static constexpr size_t CutOff = (size_t)1 << 31; // 2GB
 
 // Unfortunately we cannot deduce the size of the array, so every change
 // to the number of buckets should be reflected here.
-using BucketsArrayType = std::array<size_t, 21>;
+using BucketsArrayType = std::array<size_t, 53>;
 
 // Generates a list of bucket sizes used by the allocator.
 static constexpr BucketsArrayType generateBucketSizes() {
@@ -41,7 +47,7 @@ static constexpr BucketsArrayType generateBucketSizes() {
 // allow to write this in a nicer way.
 
 // Simple helper to compute power of 2
-#define P(n) (1 << n)
+#define P(n) (1 << (size_t)n)
 
   BucketsArrayType Sizes = {32,    48,
                             64,    96,
@@ -53,6 +59,22 @@ static constexpr BucketsArrayType generateBucketSizes() {
                             P(12), P(12) + P(11),
                             P(13), P(13) + P(12),
                             P(14), P(14) + P(13),
+                            P(15), P(15) + P(14),
+                            P(16), P(16) + P(15),
+                            P(17), P(17) + P(16),
+                            P(18), P(18) + P(17),
+                            P(19), P(19) + P(18),
+                            P(20), P(20) + P(19),
+                            P(21), P(21) + P(20),
+                            P(22), P(22) + P(21),
+                            P(23), P(23) + P(22),
+                            P(24), P(24) + P(23),
+                            P(25), P(25) + P(24),
+                            P(26), P(26) + P(25),
+                            P(27), P(27) + P(26),
+                            P(28), P(28) + P(27),
+                            P(29), P(29) + P(28),
+                            P(30), P(30) + P(29),
                             CutOff};
 #undef P
 
@@ -64,6 +86,60 @@ static constexpr BucketsArrayType BucketSizes = generateBucketSizes();
 // The implementation expects that SlabSize is 2^n
 static_assert((SlabSize & (SlabSize - 1)) == 0,
               "SlabSize must be a power of 2");
+
+static size_t MaxPoolableSize = 1024;
+static size_t MaxPoolSize = 4096;
+static size_t Capacity = 4;
+static size_t CurPoolSize = 0;
+
+// Protects the capacity checking of the pool.
+static std::mutex PoolLock;
+
+static bool Trace = false;
+static class SetLimits {
+public:
+  SetLimits() {
+    // Parse optional parameters of this form (applicable to each context):
+    // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=[<MaxPoolableSize>][,[<Capacity>][,[<MaxPoolSize>]]]
+    // MaxPoolableSize: Maximum poolable allocation size, specified in MB.
+    //                  Default 1GB.
+    // Capacity:        Number of pooled allocations in each bucket.
+    //                  Default 4.
+    // MaxPoolSize:     Maximum size of pool, specified in MB.
+    //                  Default 4GB.
+
+    char *PoolParams = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS");
+    Trace = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_TRACE") != nullptr;
+    if (PoolParams != nullptr) {
+      std::string Params(PoolParams);
+      size_t Pos = Params.find(',');
+      if (Pos != std::string::npos) {
+        if (Pos > 0)
+          MaxPoolableSize = std::stoi(Params.substr(0, Pos));
+        Params.erase(0, Pos + 1);
+        Pos = Params.find(',');
+        if (Pos != std::string::npos) {
+          if (Pos > 0)
+            Capacity = std::stoi(Params.substr(0, Pos));
+          Params.erase(0, Pos + 1);
+          if (Pos != std::string::npos)
+            MaxPoolSize = std::stoi(Params);
+        } else {
+          Capacity = std::stoi(Params);
+        }
+      } else
+        MaxPoolableSize = std::stoi(Params);
+    }
+    MaxPoolableSize *= (1 << 20);
+    MaxPoolSize *= (1 << 20);
+
+    if (Trace) {
+      std::cout << "MaxPoolableSize = " << MaxPoolableSize << std::endl;
+      std::cout << "Capacity        = " << Capacity << std::endl;
+      std::cout << "MaxPoolSize     = " << MaxPoolSize << std::endl;
+    }
+  }
+} L;
 } // namespace settings
 
 // Aligns the pointer down to the specified alignment
@@ -91,6 +167,18 @@ static size_t AlignUp(size_t Val, size_t Alignment) {
   return (Val + Alignment - 1) & (~(Alignment - 1));
 }
 
+// Check whether a freed allocation can be retained in the pool or must be
+// returned to the system.
+static bool PoolHasCapacity(size_t Size) {
+  std::lock_guard<std::mutex> PL(settings::PoolLock);
+  size_t NewPoolSize = settings::CurPoolSize + Size;
+  if (settings::MaxPoolSize >= NewPoolSize) {
+    settings::CurPoolSize = NewPoolSize;
+    return true;
+  }
+  return false;
+}
+
 class Bucket;
 
 // Represents the allocated memory block of size 'settings::SlabSize'
@@ -143,6 +231,7 @@ class Slab {
   size_t getNumAllocated() const { return NumAllocated; }
 
   void *getFreeChunk();
+  void *getFullSlab();
 
   void *getPtr() const { return MemPtr; }
   void *getEnd() const {
@@ -181,16 +270,22 @@ class Bucket {
       : Size{Sz}, OwnAllocCtx{AllocCtx} {}
 
   void *getChunk();
+  void *getSlab();
 
   size_t getSize() const { return Size; }
 
   void freeChunk(void *Ptr, Slab &Slab);
+  void freeSlab(void *Ptr, Slab &Slab);
+
   SystemMemory &getMemHandle();
   USMAllocContext::USMAllocImpl &getUsmAllocCtx() { return OwnAllocCtx; }
 
+  bool BucketHasCapacity();
+
 private:
   void onFreeChunk(Slab &);
   decltype(AvailableSlabs.begin()) getAvailSlab();
+  decltype(AvailableSlabs.begin()) getAvailFullSlab();
 };
 
 class USMAllocContext::USMAllocImpl {
@@ -242,12 +337,15 @@ std::ostream &operator<<(std::ostream &Os, const Slab &Slab) {
 }
 
 Slab::Slab(Bucket &Bkt)
-    : MemPtr(Bkt.getMemHandle().allocate(settings::SlabSize)),
+    : // MemPtr(Bkt.getMemHandle().allocate(Bkt.getSize())),
       // In case if bucket size is not that SlabSize % b.getSize() == 0, we
       // would have some padding at the end of the slab.
       Chunks(settings::SlabSize / Bkt.getSize()), NumAllocated{0},
       bucket(Bkt), SlabListIter{}, FirstFreeChunkIdx{0} {
-
+  size_t SlabAllocSize = Bkt.getSize();
+  if (SlabAllocSize < settings::SlabSize)
+    SlabAllocSize = settings::SlabSize;
+  MemPtr = Bkt.getMemHandle().allocate(SlabAllocSize);
   regSlab(*this);
 }
 
@@ -286,6 +384,8 @@ void *Slab::getFreeChunk() {
   return FreeChunk;
 }
 
+void *Slab::getFullSlab() { return getPtr(); }
+
 Bucket &Slab::getBucket() { return bucket; }
 const Bucket &Slab::getBucket() const { return bucket; }
 
@@ -359,6 +459,44 @@ void Slab::freeChunk(void *Ptr) {
 
 bool Slab::hasAvail() { return NumAllocated != getNumChunks(); }
 
+auto Bucket::getAvailFullSlab() -> decltype(AvailableSlabs.begin()) {
+  if (AvailableSlabs.size() == 0) {
+    auto It = AvailableSlabs.insert(AvailableSlabs.begin(),
+                                    std::make_unique<Slab>(*this));
+    (*It)->setIterator(It);
+  } else {
+    settings::CurPoolSize -= getSize();
+  }
+
+  return AvailableSlabs.begin();
+}
+
+void *Bucket::getSlab() {
+  std::lock_guard<std::mutex> Lg(BucketLock);
+
+  auto SlabIt = getAvailFullSlab();
+  auto *FreeSlab = (*SlabIt)->getFullSlab();
+  auto It =
+      UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt));
+  AvailableSlabs.erase(SlabIt);
+  (*It)->setIterator(It);
+  return FreeSlab;
+}
+
+void Bucket::freeSlab(void *Ptr, Slab &Slab) {
+  std::lock_guard<std::mutex> Lg(BucketLock);
+  auto SlabIter = Slab.getIterator();
+  assert(SlabIter != UnavailableSlabs.end());
+  if (PoolHasCapacity(getSize()) && BucketHasCapacity()) {
+    auto It =
+        AvailableSlabs.insert(AvailableSlabs.begin(), std::move(*SlabIter));
+    UnavailableSlabs.erase(SlabIter);
+    (*It)->setIterator(It);
+  } else {
+    UnavailableSlabs.erase(SlabIter);
+  }
+}
+
 auto Bucket::getAvailSlab() -> decltype(AvailableSlabs.begin()) {
   if (AvailableSlabs.size() == 0) {
     auto It = AvailableSlabs.insert(AvailableSlabs.begin(),
@@ -409,15 +547,29 @@ void Bucket::onFreeChunk(Slab &Slab) {
     (*It)->setIterator(It);
   }
 
-  // Remove the slab when all the chunks from it are deallocated
-  // Note: since the slab is stored as unique_ptr, just remove it from
-  // the list to remove the list to destroy the object
+  // If slab has no chunks allocated we could pool it if capacity is available
+  // or release it to the system.
   if (Slab.getNumAllocated() == 0) {
-    auto It = Slab.getIterator();
-    assert(It != AvailableSlabs.end());
+    // Pool has no space so release it.
+    if (!(PoolHasCapacity(getSize()) && BucketHasCapacity())) {
+      // Remove the slab when all the chunks from it are deallocated
+      // Note: since the slab is stored as unique_ptr, just remove it from
+      // the list to remove the list to destroy the object
+      auto It = Slab.getIterator();
+      assert(It != AvailableSlabs.end());
+
+      AvailableSlabs.erase(It);
+    }
+  }
+}
 
-    AvailableSlabs.erase(It);
+// Check whether a bucket has capacity for retaining a freed allocation.
+bool Bucket::BucketHasCapacity() {
+  size_t NewFreeSlabsInBucket = AvailableSlabs.size() + 1;
+  if (settings::Capacity >= NewFreeSlabsInBucket) {
+    return true;
   }
+  return false;
 }
 
 SystemMemory &Bucket::getMemHandle() { return OwnAllocCtx.getMemHandle(); }
@@ -426,10 +578,16 @@ void *USMAllocContext::USMAllocImpl::allocate(size_t Size) {
   if (Size == 0)
     return nullptr;
 
-  if (Size > settings::CutOff)
+  if (Size > settings::MaxPoolableSize) {
     return getMemHandle().allocate(Size);
+  }
+
+  auto &Bucket = findBucket(Size);
+  if (Size > settings::ChunkCutOff) {
+    return Bucket.getSlab();
+  }
 
-  return findBucket(Size).getChunk();
+  return Bucket.getChunk();
 }
 
 void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment) {
@@ -441,13 +599,19 @@ void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment) {
 
   size_t AlignedSize = (Size > 1) ? AlignUp(Size, Alignment) : Alignment;
 
-  // Check if our largest chunk is able to fit aligned size.
+  // Check if requested allocation size is within pooling limit.
   // If not, just request aligned pointer from the system.
-  if (AlignedSize > settings::CutOff) {
+  if (AlignedSize > settings::MaxPoolableSize) {
     return getMemHandle().allocate(Size, Alignment);
   }
 
-  auto *Ptr = findBucket(AlignedSize).getChunk();
+  void *Ptr;
+  auto &Bucket = findBucket(AlignedSize);
+  if (AlignedSize > settings::ChunkCutOff) {
+    Ptr = Bucket.getSlab();
+  } else {
+    Ptr = Bucket.getChunk();
+  }
   return AlignPtrUp(Ptr, Alignment);
 }
 
@@ -481,12 +645,15 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) {
     // protected by the lock, so it's safe to access it here.
     auto &Slab = It->second;
     if (Ptr >= Slab.getPtr() && Ptr < Slab.getEnd()) {
-
       // Unlock the map before freeing the chunk, it may be locked on write
       // there
       Lk.unlock();
       auto &Bucket = Slab.getBucket();
-      Bucket.freeChunk(Ptr, Slab);
+      if (Bucket.getSize() <= settings::ChunkCutOff) {
+        Bucket.freeChunk(Ptr, Slab);
+      } else {
+        Bucket.freeSlab(Ptr, Slab);
+      }
       return;
     }
   }
diff --git a/sycl/plugins/level_zero/usm_allocator.hpp b/sycl/plugins/level_zero/usm_allocator.hpp
index b72ca77d41538..42e15e6987e7f 100644
--- a/sycl/plugins/level_zero/usm_allocator.hpp
+++ b/sycl/plugins/level_zero/usm_allocator.hpp
@@ -31,6 +31,7 @@ class USMAllocContext {
   void *allocate(size_t size);
   void *allocate(size_t size, size_t alignment);
   void deallocate(void *ptr);
+  void printStats();
 
 private:
   std::unique_ptr<USMAllocImpl> pImpl;

From 835fec37ee6c5f8180a92e21853fdda302fd109e Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 25 Mar 2021 16:06:51 -0700
Subject: [PATCH 02/16] Removed dead code.

---
 sycl/plugins/level_zero/usm_allocator.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sycl/plugins/level_zero/usm_allocator.hpp b/sycl/plugins/level_zero/usm_allocator.hpp
index 42e15e6987e7f..b72ca77d41538 100644
--- a/sycl/plugins/level_zero/usm_allocator.hpp
+++ b/sycl/plugins/level_zero/usm_allocator.hpp
@@ -31,7 +31,6 @@ class USMAllocContext {
   void *allocate(size_t size);
   void *allocate(size_t size, size_t alignment);
   void deallocate(void *ptr);
-  void printStats();
 
 private:
   std::unique_ptr<USMAllocImpl> pImpl;

From abcffba08825b4529c7e200d91c2a05a3e9876ef Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 25 Mar 2021 20:30:44 -0700
Subject: [PATCH 03/16] Added test.

---
 sycl/test/on-device/usm_pooling.cpp | 125 ++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100755 sycl/test/on-device/usm_pooling.cpp

diff --git a/sycl/test/on-device/usm_pooling.cpp b/sycl/test/on-device/usm_pooling.cpp
new file mode 100755
index 0000000000000..430a953eefa80
--- /dev/null
+++ b/sycl/test/on-device/usm_pooling.cpp
@@ -0,0 +1,125 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+
+// Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB.
+
+// With no pooling:       1, 2, 3, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
+
+// With pooling enabled and MaxPooolable=1MB: 1, 2, 3, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+
+// With With pooling enabled and capacity=1:       1, 2, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+
+// With With pooling enabled and MaxPoolSize=2MB:  1, 2, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,,2 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,,2 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,,2 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+
+#include "CL/sycl.hpp"
+using namespace sycl;
+
+constexpr size_t SIZE = 2 * 1024 * 1024;
+
+void test_host(context C) {
+
+  void* ph1 = malloc_host(SIZE, C);
+  void* ph2 = malloc_host(SIZE, C);
+  free(ph1, C);
+  free(ph2, C);
+  void* ph3 = malloc_host(SIZE, C);
+  void* ph4 = malloc_host(SIZE, C);
+  void* ph5 = malloc_host(SIZE, C);
+  free(ph3, C);
+  free(ph4, C);
+  free(ph5, C);
+}
+
+void test_device(context C, device D) {
+
+  void* ph1 = malloc_device(SIZE, D, C);
+  void* ph2 = malloc_device(SIZE, D, C);
+  free(ph1, C);
+  free(ph2, C);
+  void* ph3 = malloc_device(SIZE, D, C);
+  void* ph4 = malloc_device(SIZE, D, C);
+  void* ph5 = malloc_device(SIZE, D, C);
+  free(ph3, C);
+  free(ph4, C);
+  free(ph5, C);
+}
+
+void test_shared(context C, device D) {
+
+  void* ph1 = malloc_shared(SIZE, D, C);
+  void* ph2 = malloc_shared(SIZE, D, C);
+  free(ph1, C);
+  free(ph2, C);
+  void* ph3 = malloc_shared(SIZE, D, C);
+  void* ph4 = malloc_shared(SIZE, D, C);
+  void* ph5 = malloc_shared(SIZE, D, C);
+  free(ph3, C);
+  free(ph4, C);
+  free(ph5, C);
+}
+
+int main(int argc, char *argv[]) {
+  queue Q;
+  device D = Q.get_device();
+  context C = Q.get_context();
+
+  const char *devType = D.is_host() ? "Host" : D.is_cpu() ? "CPU" : "GPU";
+  std::string pluginName =
+      D.get_platform().get_info<sycl::info::platform::name>();
+  std::cout << "Running on device " << devType << " ("
+            << D.get_info<cl::sycl::info::device::name>() << ") " << pluginName
+            << " plugin\n";
+
+  if (*argv[1] == 'h') {
+    std::cerr << "Test zeMemAllocHost\n";
+    test_host(C);
+  } else if (*argv[1] == 'd') {
+    std::cerr << "Test zeMemAllocDevice\n";
+    test_device(C, D);
+  } else if (*argv[1] == 's') {
+    std::cerr << "Test zeMemAllocShared\n";
+    test_shared(C, D);
+  }
+
+  return 0;
+}
+
+// CHECK-NOPOOL: Test [[API:zeMemAllocHost|zeMemAllocDevice|zeMemAllocShared]]
+// CHECK-NOPOOL-NEXT:  ZE ---> [[API]](
+// CHECK-NOPOOL-NEXT:  ZE ---> [[API]](
+// CHECK-NOPOOL-NEXT:  ZE ---> zeMemFree
+// CHECK-NOPOOL-NEXT:  ZE ---> zeMemFree
+// CHECK-NOPOOL-NEXT:  ZE ---> [[API]](
+// CHECK-NOPOOL-NEXT:  ZE ---> [[API]](
+// CHECK-NOPOOL-NEXT:  ZE ---> [[API]](
+
+// CHECK-12345: Test [[API:zeMemAllocHost|zeMemAllocDevice|zeMemAllocShared]]
+// CHECK-12345-NEXT:  ZE ---> [[API]](
+// CHECK-12345-NEXT:  ZE ---> [[API]](
+// CHECK-12345-NEXT:  ZE ---> zeMemGetAllocProperties
+// CHECK-12345-NEXT:  ZE ---> zeMemFree
+// CHECK-12345-NEXT:  ZE ---> zeMemGetAllocProperties
+// CHECK-12345-NEXT:  ZE ---> zeMemFree
+// CHECK-12345-NEXT:  ZE ---> [[API]](
+// CHECK-12345-NEXT:  ZE ---> [[API]](
+// CHECK-12345-NEXT:  ZE ---> [[API]](
+
+// CHECK-1245: Test [[API:zeMemAllocHost|zeMemAllocDevice|zeMemAllocShared]]
+// CHECK-1245-NEXT:  ZE ---> [[API]](
+// CHECK-1245-NEXT:  ZE ---> [[API]](
+// CHECK-1245-NEXT:  ZE ---> zeMemGetAllocProperties
+// CHECK-1245-NEXT:  ZE ---> zeMemGetAllocProperties
+// CHECK-1245-NEXT:  ZE ---> zeMemFree
+// CHECK-1245-NEXT:  ZE ---> [[API]](
+// CHECK-1245-NEXT:  ZE ---> [[API]](

From dd0b5c66a698b028561ee234d445918fcf812d6c Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 25 Mar 2021 21:36:59 -0700
Subject: [PATCH 04/16] Restrict test to level_zero.

---
 sycl/test/on-device/usm_pooling.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sycl/test/on-device/usm_pooling.cpp b/sycl/test/on-device/usm_pooling.cpp
index 430a953eefa80..6131957bb810a 100755
--- a/sycl/test/on-device/usm_pooling.cpp
+++ b/sycl/test/on-device/usm_pooling.cpp
@@ -1,3 +1,5 @@
+// REQUIRES: level_zero
+
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 
 // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB.

From b47c2960e11c5e338d7b5bfe1dd131efdc414da3 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 30 Mar 2021 16:58:01 -0700
Subject: [PATCH 05/16] Changed defaults; other review comments.

---
 sycl/doc/EnvironmentVariables.md          |   1 +
 sycl/plugins/level_zero/pi_level_zero.cpp |  15 +++-
 sycl/plugins/level_zero/usm_allocator.cpp | 104 ++++++++++------------
 3 files changed, 63 insertions(+), 57 deletions(-)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index f005abca796bd..1352620d982ea 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -28,6 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
+| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Maximum allocation size in MB that may be pooled, number of pooled allocations in each size bucket, maximum size of pool in MB. Defaults are 1, 4, 256. |
 | SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
 | SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. |
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 77085b23df6ee..e291fbc4243ea 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2444,11 +2444,14 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
   }
 
   pi_result Result;
+  auto Alignment = std::min(Size, 64UL);
+  if ((Alignment & (Alignment - 1)) != 0)
+    Alignment = 1;
   if (DeviceIsIntegrated) {
-    Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, 64);
+    Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
   } else {
     Result = piextUSMDeviceAlloc(&Ptr, Context, Context->Devices[0], nullptr,
-                                 Size, 64);
+                                 Size, Alignment);
   }
   if (Result != PI_SUCCESS)
     return Result;
@@ -5451,6 +5454,8 @@ pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
   } catch (const UsmAllocationException &Ex) {
     *ResultPtr = nullptr;
     return Ex.getError();
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
   }
 
   return PI_SUCCESS;
@@ -5478,6 +5483,8 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
   } catch (const UsmAllocationException &Ex) {
     *ResultPtr = nullptr;
     return Ex.getError();
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
   }
 
   return PI_SUCCESS;
@@ -5499,6 +5506,8 @@ pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
   } catch (const UsmAllocationException &Ex) {
     *ResultPtr = nullptr;
     return Ex.getError();
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
   }
 
   return PI_SUCCESS;
@@ -5525,6 +5534,8 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) {
       Context->HostMemAllocContext->deallocate(Ptr);
     } catch (const UsmAllocationException &Ex) {
       return Ex.getError();
+    } catch (...) {
+      return PI_ERROR_UNKNOWN;
     }
     return PI_SUCCESS;
   }
diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index 9425f93e97dcc..aa8118ec86fc6 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -20,16 +20,17 @@
 #include <vector>
 
 #include "usm_allocator.hpp"
+#include <CL/sycl/detail/spinlock.hpp>
 #include <iostream>
 
 namespace settings {
 // Minimum allocation size that will be requested from the system.
-static constexpr size_t SlabSize = 64 * 1024; // 64KB
+static constexpr size_t SlabMinSize = 64 * 1024; // 64KB
 
 // Allocations <= ChunkCutOff will use chunks from individual slabs.
 // Allocations >  ChunkCutOff will be rounded up to a multiple of
-// SlabSize and allocated to occupy the whole slab.
-static constexpr size_t ChunkCutOff = SlabSize / 2;
+// SlabMinSize and allocated to occupy the whole slab.
+static constexpr size_t ChunkCutOff = SlabMinSize / 2;
 // The largest size which is allocated via the allocator.
 // Allocations with size > CutOff bypass the USM allocator and
 // go directly to the runtime.
@@ -47,7 +48,7 @@ static constexpr BucketsArrayType generateBucketSizes() {
 // allow to write this in a nicer way.
 
 // Simple helper to compute power of 2
-#define P(n) (1 << (size_t)n)
+#define P(n) (1ULL << n)
 
   BucketsArrayType Sizes = {32,    48,
                             64,    96,
@@ -83,33 +84,31 @@ static constexpr BucketsArrayType generateBucketSizes() {
 
 static constexpr BucketsArrayType BucketSizes = generateBucketSizes();
 
-// The implementation expects that SlabSize is 2^n
-static_assert((SlabSize & (SlabSize - 1)) == 0,
-              "SlabSize must be a power of 2");
+// The implementation expects that SlabMinSize is 2^n
+static_assert((SlabMinSize & (SlabMinSize - 1)) == 0,
+              "SlabMinSize must be a power of 2");
 
-static size_t MaxPoolableSize = 1024;
-static size_t MaxPoolSize = 4096;
+static size_t MaxPoolableSize = 1;
 static size_t Capacity = 4;
+static size_t MaxPoolSize = 256;
 static size_t CurPoolSize = 0;
 
 // Protects the capacity checking of the pool.
-static std::mutex PoolLock;
+static sycl::detail::SpinLock PoolLock;
 
-static bool Trace = false;
 static class SetLimits {
 public:
   SetLimits() {
     // Parse optional parameters of this form (applicable to each context):
     // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=[<MaxPoolableSize>][,[<Capacity>][,[<MaxPoolSize>]]]
     // MaxPoolableSize: Maximum poolable allocation size, specified in MB.
-    //                  Default 1GB.
+    //                  Default 1MB.
     // Capacity:        Number of pooled allocations in each bucket.
     //                  Default 4.
     // MaxPoolSize:     Maximum size of pool, specified in MB.
-    //                  Default 4GB.
+    //                  Default 256MB.
 
-    char *PoolParams = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS");
-    Trace = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_TRACE") != nullptr;
+    char *PoolParams = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR");
     if (PoolParams != nullptr) {
       std::string Params(PoolParams);
       size_t Pos = Params.find(',');
@@ -132,12 +131,6 @@ static class SetLimits {
     }
     MaxPoolableSize *= (1 << 20);
     MaxPoolSize *= (1 << 20);
-
-    if (Trace) {
-      std::cout << "MaxPoolableSize = " << MaxPoolableSize << std::endl;
-      std::cout << "Capacity        = " << Capacity << std::endl;
-      std::cout << "MaxPoolSize     = " << MaxPoolSize << std::endl;
-    }
   }
 } L;
 } // namespace settings
@@ -167,21 +160,9 @@ static size_t AlignUp(size_t Val, size_t Alignment) {
   return (Val + Alignment - 1) & (~(Alignment - 1));
 }
 
-// Check whether a freed allocation can be retained in the pool or must be
-// returned to the system.
-static bool PoolHasCapacity(size_t Size) {
-  std::lock_guard<std::mutex> PL(settings::PoolLock);
-  size_t NewPoolSize = settings::CurPoolSize + Size;
-  if (settings::MaxPoolSize >= NewPoolSize) {
-    settings::CurPoolSize = NewPoolSize;
-    return true;
-  }
-  return false;
-}
-
 class Bucket;
 
-// Represents the allocated memory block of size 'settings::SlabSize'
+// Represents the allocated memory block of size 'settings::SlabMinSize'
 // Internally, it splits the memory block into chunks. The number of
 // chunks depends of the size of a Bucket which created the Slab.
 // The chunks
@@ -189,7 +170,7 @@ class Bucket;
 // so no locking happens here.
 class Slab {
 
-  // Pointer to the allocated memory of SlabSize bytes
+  // Pointer to the allocated memory of SlabMinSize bytes
   void *MemPtr;
 
   // Represents the current state of each chunk:
@@ -235,7 +216,7 @@ class Slab {
 
   void *getPtr() const { return MemPtr; }
   void *getEnd() const {
-    return static_cast<char *>(getPtr()) + settings::SlabSize;
+    return static_cast<char *>(getPtr()) + settings::SlabMinSize;
   }
 
   size_t getChunkSize() const;
@@ -280,11 +261,17 @@ class Bucket {
   SystemMemory &getMemHandle();
   USMAllocContext::USMAllocImpl &getUsmAllocCtx() { return OwnAllocCtx; }
 
-  bool BucketHasCapacity();
+  bool CanPool();
 
 private:
   void onFreeChunk(Slab &);
+
+  // Get a slab to be used for chunked allocations.
+  // These slabs are used for allocations <= ChunkCutOff and not pooled.
   decltype(AvailableSlabs.begin()) getAvailSlab();
+
+  // Get a slab that will be used as a whole for a single allocation.
+  // These slabs are > ChunkCutOff in size and pooled.
   decltype(AvailableSlabs.begin()) getAvailFullSlab();
 };
 
@@ -337,14 +324,13 @@ std::ostream &operator<<(std::ostream &Os, const Slab &Slab) {
 }
 
 Slab::Slab(Bucket &Bkt)
-    : // MemPtr(Bkt.getMemHandle().allocate(Bkt.getSize())),
-      // In case if bucket size is not that SlabSize % b.getSize() == 0, we
-      // would have some padding at the end of the slab.
-      Chunks(settings::SlabSize / Bkt.getSize()), NumAllocated{0},
+    : // In case bucket size is not a multiple of SlabMinSize, we would have
+      // some padding at the end of the slab.
+      Chunks(settings::SlabMinSize / Bkt.getSize()), NumAllocated{0},
       bucket(Bkt), SlabListIter{}, FirstFreeChunkIdx{0} {
   size_t SlabAllocSize = Bkt.getSize();
-  if (SlabAllocSize < settings::SlabSize)
-    SlabAllocSize = settings::SlabSize;
+  if (SlabAllocSize < settings::SlabMinSize)
+    SlabAllocSize = settings::SlabMinSize;
   MemPtr = Bkt.getMemHandle().allocate(SlabAllocSize);
   regSlab(*this);
 }
@@ -420,16 +406,16 @@ void Slab::unregSlabByAddr(void *Addr, Slab &Slab) {
 }
 
 void Slab::regSlab(Slab &Slab) {
-  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabSize);
-  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabSize;
+  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabMinSize);
+  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabMinSize;
 
   regSlabByAddr(StartAddr, Slab);
   regSlabByAddr(EndAddr, Slab);
 }
 
 void Slab::unregSlab(Slab &Slab) {
-  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabSize);
-  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabSize;
+  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabMinSize);
+  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabMinSize;
 
   unregSlabByAddr(StartAddr, Slab);
   unregSlabByAddr(EndAddr, Slab);
@@ -460,12 +446,15 @@ void Slab::freeChunk(void *Ptr) {
 bool Slab::hasAvail() { return NumAllocated != getNumChunks(); }
 
 auto Bucket::getAvailFullSlab() -> decltype(AvailableSlabs.begin()) {
+  // Return a slab that will be used for a single allocation.
   if (AvailableSlabs.size() == 0) {
     auto It = AvailableSlabs.insert(AvailableSlabs.begin(),
                                     std::make_unique<Slab>(*this));
     (*It)->setIterator(It);
   } else {
-    settings::CurPoolSize -= getSize();
+    // If a slab was available in the pool then note that the current pooled
+    // size has reduced by the size of this slab.
+    settings::CurPoolSize -= Size;
   }
 
   return AvailableSlabs.begin();
@@ -487,7 +476,7 @@ void Bucket::freeSlab(void *Ptr, Slab &Slab) {
   std::lock_guard<std::mutex> Lg(BucketLock);
   auto SlabIter = Slab.getIterator();
   assert(SlabIter != UnavailableSlabs.end());
-  if (PoolHasCapacity(getSize()) && BucketHasCapacity()) {
+  if (CanPool()) {
     auto It =
         AvailableSlabs.insert(AvailableSlabs.begin(), std::move(*SlabIter));
     UnavailableSlabs.erase(SlabIter);
@@ -513,7 +502,7 @@ void *Bucket::getChunk() {
   auto SlabIt = getAvailSlab();
   auto *FreeChunk = (*SlabIt)->getFreeChunk();
 
-  // If the slab is full, move it to unavailable slabs and update its itreator
+  // If the slab is full, move it to unavailable slabs and update its iterator
   if (!((*SlabIt)->hasAvail())) {
     auto It =
         UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt));
@@ -551,7 +540,7 @@ void Bucket::onFreeChunk(Slab &Slab) {
   // or release it to the system.
   if (Slab.getNumAllocated() == 0) {
     // Pool has no space so release it.
-    if (!(PoolHasCapacity(getSize()) && BucketHasCapacity())) {
+    if (!CanPool()) {
       // Remove the slab when all the chunks from it are deallocated
       // Note: since the slab is stored as unique_ptr, just remove it from
       // the list to remove the list to destroy the object
@@ -563,11 +552,16 @@ void Bucket::onFreeChunk(Slab &Slab) {
   }
 }
 
-// Check whether a bucket has capacity for retaining a freed allocation.
-bool Bucket::BucketHasCapacity() {
+// Check whether an allocation to be freed can be placed in the pool.
+bool Bucket::CanPool() {
+  std::lock_guard<sycl::detail::SpinLock> Lock{settings::PoolLock};
   size_t NewFreeSlabsInBucket = AvailableSlabs.size() + 1;
   if (settings::Capacity >= NewFreeSlabsInBucket) {
-    return true;
+    size_t NewPoolSize = settings::CurPoolSize + Size;
+    if (settings::MaxPoolSize >= NewPoolSize) {
+      settings::CurPoolSize = NewPoolSize;
+      return true;
+    }
   }
   return false;
 }
@@ -628,7 +622,7 @@ Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) {
 }
 
 void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) {
-  auto *SlabPtr = AlignPtrDown(Ptr, settings::SlabSize);
+  auto *SlabPtr = AlignPtrDown(Ptr, settings::SlabMinSize);
 
   // Lock the map on read
   std::shared_lock<std::shared_timed_mutex> Lk(getKnownSlabsMapLock());

From 66c8138aaef714ae62b220087dba108861329755 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 30 Mar 2021 18:31:33 -0700
Subject: [PATCH 06/16] Test corrected and build error fixed.

---
 sycl/plugins/level_zero/pi_level_zero.cpp |  9 +++++++--
 sycl/test/on-device/usm_pooling.cpp       | 20 +++++++++-----------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index e291fbc4243ea..d252bdf6453cb 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2443,10 +2443,15 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
     //
   }
 
-  pi_result Result;
-  auto Alignment = std::min(Size, 64UL);
+  // Alignment is at most 64.
+  auto Alignment = Size;
+  if (Alignment > 64UL)
+    Alignment = 64UL;
+  // Alignment must be a power of 2, else make it 1.
   if ((Alignment & (Alignment - 1)) != 0)
     Alignment = 1;
+
+  pi_result Result;
   if (DeviceIsIntegrated) {
     Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
   } else {
diff --git a/sycl/test/on-device/usm_pooling.cpp b/sycl/test/on-device/usm_pooling.cpp
index 6131957bb810a..a83a2e49f4f5e 100755
--- a/sycl/test/on-device/usm_pooling.cpp
+++ b/sycl/test/on-device/usm_pooling.cpp
@@ -1,5 +1,3 @@
-// REQUIRES: level_zero
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 
 // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB.
@@ -10,19 +8,19 @@
 // RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
 
 // With pooling enabled and MaxPooolable=1MB: 1, 2, 3, 4, 5 allocs lead to ZE call.
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
 
 // With With pooling enabled and capacity=1:       1, 2, 4, 5 allocs lead to ZE call.
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
 
 // With With pooling enabled and MaxPoolSize=2MB:  1, 2, 4, 5 allocs lead to ZE call.
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,,2 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,,2 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=,,2 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
 
 #include "CL/sycl.hpp"
 using namespace sycl;

From 3abfa22e8099218d421c94fdf4d5e3fd332e68a7 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 30 Mar 2021 20:46:41 -0700
Subject: [PATCH 07/16] Test correction.

---
 sycl/test/on-device/usm_pooling.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sycl/test/on-device/usm_pooling.cpp b/sycl/test/on-device/usm_pooling.cpp
index a83a2e49f4f5e..b7d968eecfa21 100755
--- a/sycl/test/on-device/usm_pooling.cpp
+++ b/sycl/test/on-device/usm_pooling.cpp
@@ -1,3 +1,5 @@
+// REQUIRES: level_zero
+
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 
 // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB.

From f875ade5cb97d0c050da7861ba7a06ef70b67ec7 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 1 Apr 2021 11:09:34 -0700
Subject: [PATCH 08/16] Change to alignment.

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index d252bdf6453cb..0fa91e6c66ad3 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2443,13 +2443,13 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
     //
   }
 
-  // Alignment is at most 64.
+  // Choose an alignment is at most 64.
   auto Alignment = Size;
   if (Alignment > 64UL)
     Alignment = 64UL;
-  // Alignment must be a power of 2, else make it 1.
+  // When less than 64, it must be a power of 2, else make it 8.
   if ((Alignment & (Alignment - 1)) != 0)
-    Alignment = 1;
+    Alignment = 8;
 
   pi_result Result;
   if (DeviceIsIntegrated) {

From d3a03a74b065684cdf98742f7c668d33b4427b19 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 1 Apr 2021 16:50:56 -0700
Subject: [PATCH 09/16] Change to buffer alignment.

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index ae347b22693cf..990afc5136502 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2481,13 +2481,23 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
     //
   }
 
-  // Choose an alignment is at most 64.
+  // Choose an alignment that is at most 64 and is the next power of 2 for sizes
+  // less than 64.
   auto Alignment = Size;
-  if (Alignment > 64UL)
+  if (Alignment > 32UL)
     Alignment = 64UL;
-  // When less than 64, it must be a power of 2, else make it 8.
-  if ((Alignment & (Alignment - 1)) != 0)
-    Alignment = 8;
+  else if (Alignment > 16UL)
+    Alignment = 32UL;
+  else if (Alignment > 8UL)
+    Alignment = 16UL;
+  else if (Alignment > 4UL)
+    Alignment = 8UL;
+  else if (Alignment > 2UL)
+    Alignment = 4UL;
+  else if (Alignment > 1UL)
+    Alignment = 2UL;
+  else
+    Alignment = 1UL;
 
   pi_result Result;
   if (DeviceIsIntegrated) {

From 00c87db47cef05b03325c02206d0cde2c6a492e0 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 6 Apr 2021 11:17:29 -0700
Subject: [PATCH 10/16] Changes based on review comments.

---
 sycl/plugins/level_zero/usm_allocator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index aa8118ec86fc6..772a917d02698 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -132,7 +132,7 @@ static class SetLimits {
     MaxPoolableSize *= (1 << 20);
     MaxPoolSize *= (1 << 20);
   }
-} L;
+} UsmPoolSettings;
 } // namespace settings
 
 // Aligns the pointer down to the specified alignment

From 1733de68b04b4aa5c38db001a4861514b092152f Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 13 Apr 2021 13:10:02 -0700
Subject: [PATCH 11/16] Review responses.

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 29 +++++++++++++----------
 sycl/plugins/level_zero/pi_level_zero.hpp |  4 +++-
 sycl/plugins/level_zero/usm_allocator.cpp |  2 +-
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index f82e600284031..3320bad74789a 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2571,7 +2571,7 @@ pi_result piMemRelease(pi_mem Mem) {
     } else {
       auto Buf = static_cast<_pi_buffer *>(Mem);
       if (!Buf->isSubBuffer()) {
-        piextUSMFree(Mem->Context, Mem->getZeHandle());
+        PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
       }
     }
     delete Mem;
@@ -5407,10 +5407,10 @@ static bool ShouldUseUSMAllocator() {
 
 static const bool UseUSMAllocator = ShouldUseUSMAllocator();
 
-pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
-                             pi_device Device,
-                             pi_usm_mem_properties *Properties, size_t Size,
-                             pi_uint32 Alignment) {
+static pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
+                                    pi_device Device,
+                                    pi_usm_mem_properties *Properties,
+                                    size_t Size, pi_uint32 Alignment) {
   PI_ASSERT(Context, PI_INVALID_CONTEXT);
   PI_ASSERT(Device, PI_INVALID_DEVICE);
 
@@ -5432,10 +5432,10 @@ pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
-pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
-                             pi_device Device,
-                             pi_usm_mem_properties *Properties, size_t Size,
-                             pi_uint32 Alignment) {
+static pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
+                                    pi_device Device,
+                                    pi_usm_mem_properties *Properties,
+                                    size_t Size, pi_uint32 Alignment) {
   PI_ASSERT(Context, PI_INVALID_CONTEXT);
   PI_ASSERT(Device, PI_INVALID_DEVICE);
 
@@ -5459,9 +5459,9 @@ pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
-pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
-                           pi_usm_mem_properties *Properties, size_t Size,
-                           pi_uint32 Alignment) {
+static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
+                                  pi_usm_mem_properties *Properties,
+                                  size_t Size, pi_uint32 Alignment) {
   PI_ASSERT(Context, PI_INVALID_CONTEXT);
 
   // Check that incorrect bits are not set in the properties.
@@ -5481,7 +5481,7 @@ pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
-pi_result USMFreeImpl(pi_context Context, void *Ptr) {
+static pi_result USMFreeImpl(pi_context Context, void *Ptr) {
   ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
   return PI_SUCCESS;
 }
@@ -5609,6 +5609,9 @@ pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
     return USMHostAllocImpl(ResultPtr, Context, Properties, Size, Alignment);
   }
 
+  // There is a single allocator for Host USM allocations, so we don't need to
+  // find the allocator depending on context as we do for Shared and Device
+  // allocations.
   try {
     *ResultPtr = Context->HostMemAllocContext->allocate(Size, Alignment);
   } catch (const UsmAllocationException &Ex) {
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index 8cf40fc9df894..b74cda0146146 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -206,7 +206,9 @@ struct _pi_context : _pi_object {
       // NOTE: one must additionally call initialize() to complete
       // PI context creation.
     }
-    // Create USM allocator context for host
+    // Create USM allocator context for host. Device and Shared USM allocations
+    // are device-specific. Host allocations are not device-dependent therefore
+    // we don't need a map with device as key.
     HostMemAllocContext = new USMAllocContext(
         std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
   }
diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index 772a917d02698..92acc908149f3 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -34,7 +34,7 @@ static constexpr size_t ChunkCutOff = SlabMinSize / 2;
 // The largest size which is allocated via the allocator.
 // Allocations with size > CutOff bypass the USM allocator and
 // go directly to the runtime.
-static constexpr size_t CutOff = (size_t)1 << 31; // 2GB
+static constexpr size_t CutOff = 1024 * 1024; // 1MB
 
 // Unfortunately we cannot deduce the size of the array, so every change
 // to the number of buckets should be reflected here.

From 9391aa2da2da22b9ac89c22a732d03bc3baf706d Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 13 Apr 2021 14:52:25 -0700
Subject: [PATCH 12/16] Fix for max poolable size.

---
 sycl/plugins/level_zero/usm_allocator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index 92acc908149f3..772a917d02698 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -34,7 +34,7 @@ static constexpr size_t ChunkCutOff = SlabMinSize / 2;
 // The largest size which is allocated via the allocator.
 // Allocations with size > CutOff bypass the USM allocator and
 // go directly to the runtime.
-static constexpr size_t CutOff = 1024 * 1024; // 1MB
+static constexpr size_t CutOff = (size_t)1 << 31; // 2GB
 
 // Unfortunately we cannot deduce the size of the array, so every change
 // to the number of buckets should be reflected here.

From eae431beaf77e55bd7f2a533f481fe6297d567ec Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 14 Apr 2021 12:40:06 -0700
Subject: [PATCH 13/16] Clarified env var settings.

---
 sycl/doc/EnvironmentVariables.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 2e320d7676197..7cde94e30a6f5 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
-| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Maximum allocation size in MB that may be pooled, number of pooled allocations in each size bucket, maximum size of pool in MB. Defaults are 1, 4, 256. |
+| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Maximum allocation size in MB that may be pooled, number of pooled allocations in each size bucket, maximum size of pool in MB. Values specified as positive integers. Defaults are 1, 4, 256. |
 | SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
 | SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. |

From 44d5a0cb3d55bc97f9d18ac16b69479b2d03368f Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Fri, 16 Apr 2021 10:25:08 -0700
Subject: [PATCH 14/16] Changed some function names for uniformity. Added
 comments.

---
 sycl/plugins/level_zero/usm_allocator.cpp | 24 ++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index 772a917d02698..ce389d1c32a11 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -211,8 +211,11 @@ class Slab {
 
   size_t getNumAllocated() const { return NumAllocated; }
 
-  void *getFreeChunk();
-  void *getFullSlab();
+  // Get pointer to allocation that is one piece of this slab.
+  void *getChunk();
+
+  // Get pointer to allocation that is this entire slab.
+  void *getSlab();
 
   void *getPtr() const { return MemPtr; }
   void *getEnd() const {
@@ -250,17 +253,25 @@ class Bucket {
   Bucket(size_t Sz, USMAllocContext::USMAllocImpl &AllocCtx)
       : Size{Sz}, OwnAllocCtx{AllocCtx} {}
 
+  // Get pointer to allocation that is one piece of an available slab in this
+  // bucket.
   void *getChunk();
+
+  // Get pointer to allocation that is a full slab in this bucket.
   void *getSlab();
 
   size_t getSize() const { return Size; }
 
+  // Free an allocation that is one piece of a slab in this bucket.
   void freeChunk(void *Ptr, Slab &Slab);
+
+  // Free an allocation that is a full slab in this bucket.
   void freeSlab(void *Ptr, Slab &Slab);
 
   SystemMemory &getMemHandle();
   USMAllocContext::USMAllocImpl &getUsmAllocCtx() { return OwnAllocCtx; }
 
+  // Check whether an allocation to be freed can be placed in the pool.
   bool CanPool();
 
 private:
@@ -352,7 +363,7 @@ size_t Slab::FindFirstAvailableChunkIdx() const {
   return static_cast<size_t>(-1);
 }
 
-void *Slab::getFreeChunk() {
+void *Slab::getChunk() {
   assert(NumAllocated != Chunks.size());
 
   const size_t ChunkIdx = FindFirstAvailableChunkIdx();
@@ -370,7 +381,7 @@ void *Slab::getFreeChunk() {
   return FreeChunk;
 }
 
-void *Slab::getFullSlab() { return getPtr(); }
+void *Slab::getSlab() { return getPtr(); }
 
 Bucket &Slab::getBucket() { return bucket; }
 const Bucket &Slab::getBucket() const { return bucket; }
@@ -464,7 +475,7 @@ void *Bucket::getSlab() {
   std::lock_guard<std::mutex> Lg(BucketLock);
 
   auto SlabIt = getAvailFullSlab();
-  auto *FreeSlab = (*SlabIt)->getFullSlab();
+  auto *FreeSlab = (*SlabIt)->getSlab();
   auto It =
       UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt));
   AvailableSlabs.erase(SlabIt);
@@ -500,7 +511,7 @@ void *Bucket::getChunk() {
   std::lock_guard<std::mutex> Lg(BucketLock);
 
   auto SlabIt = getAvailSlab();
-  auto *FreeChunk = (*SlabIt)->getFreeChunk();
+  auto *FreeChunk = (*SlabIt)->getChunk();
 
   // If the slab is full, move it to unavailable slabs and update its iterator
   if (!((*SlabIt)->hasAvail())) {
@@ -552,7 +563,6 @@ void Bucket::onFreeChunk(Slab &Slab) {
   }
 }
 
-// Check whether an allocation to be freed can be placed in the pool.
 bool Bucket::CanPool() {
   std::lock_guard<sycl::detail::SpinLock> Lock{settings::PoolLock};
   size_t NewFreeSlabsInBucket = AvailableSlabs.size() + 1;

From 7d95049cdac60f0c387ddb2b158ebdd9a07944f3 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Fri, 16 Apr 2021 10:44:10 -0700
Subject: [PATCH 15/16] Modified the env var documentation.

---
 sycl/doc/EnvironmentVariables.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 51c3f4f6af26c..258d137080bda 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
-| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Maximum allocation size in MB that may be pooled, number of pooled allocations in each size bucket, maximum size of pool in MB. Values specified as positive integers. Defaults are 1, 4, 256. |
+| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Values specified as positive integers. Defaults are 1, 4, 256. MaxPoolableSize is the maximum allocation size in MB that may be pooled. Capacity is the number of allocations in each size range that are freed by the program but retained in the pool for reallocation. Size ranges follow this pattern: 32, 48, 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. MaxPoolSize is the maximum size of the pool in MB. |
 | SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
 | SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. |

From 1aa0a12bdbd3abba2e7e5e92c41e411848cb8460 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Fri, 16 Apr 2021 14:28:51 -0700
Subject: [PATCH 16/16] Moved pool settings into a class. Added comments.

---
 sycl/plugins/level_zero/usm_allocator.cpp | 41 ++++++++++++++++-------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index ce389d1c32a11..e675c73ca8437 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -23,6 +23,21 @@
 #include <CL/sycl/detail/spinlock.hpp>
 #include <iostream>
 
+// USM allocations are a mimimum of 64KB in size even when a smaller size is
+// requested. The implementation distinguishes between allocations of size
+// ChunkCutOff (32KB) and those that are larger.
+// Allocation requests smaller than ChunkCutoff use chunks taken from a single
+// 64KB USM allocation. Thus, for example, for 8-byte allocations, only 1 in
+// ~8000 requests results in a new USM allocation. Freeing results only in a
+// chunk of a larger 64KB allocation to be marked as available and no real
+// return to the system. An allocation is returned to the system only when all
+// chunks in a 64KB allocation are freed by the program.
+// Allocations larger than ChunkCutOff use a separate USM allocation for each
+// request. These are subject to "pooling". That is, when such an allocation is
+// freed by the program it is retained in a pool. The pool is available for
+// future allocations, which means there are fewer actual USM
+// allocations/deallocations.
+
 namespace settings {
 // Minimum allocation size that will be requested from the system.
 static constexpr size_t SlabMinSize = 64 * 1024; // 64KB
@@ -88,16 +103,16 @@ static constexpr BucketsArrayType BucketSizes = generateBucketSizes();
 static_assert((SlabMinSize & (SlabMinSize - 1)) == 0,
               "SlabMinSize must be a power of 2");
 
-static size_t MaxPoolableSize = 1;
-static size_t Capacity = 4;
-static size_t MaxPoolSize = 256;
-static size_t CurPoolSize = 0;
-
 // Protects the capacity checking of the pool.
 static sycl::detail::SpinLock PoolLock;
 
 static class SetLimits {
 public:
+  size_t MaxPoolableSize = 1;
+  size_t Capacity = 4;
+  size_t MaxPoolSize = 256;
+  size_t CurPoolSize = 0;
+
   SetLimits() {
     // Parse optional parameters of this form (applicable to each context):
     // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR_SETTINGS=[<MaxPoolableSize>][,[<Capacity>][,[<MaxPoolSize>]]]
@@ -132,7 +147,7 @@ static class SetLimits {
     MaxPoolableSize *= (1 << 20);
     MaxPoolSize *= (1 << 20);
   }
-} UsmPoolSettings;
+} USMPoolSettings;
 } // namespace settings
 
 // Aligns the pointer down to the specified alignment
@@ -465,7 +480,7 @@ auto Bucket::getAvailFullSlab() -> decltype(AvailableSlabs.begin()) {
   } else {
     // If a slab was available in the pool then note that the current pooled
     // size has reduced by the size of this slab.
-    settings::CurPoolSize -= Size;
+    settings::USMPoolSettings.CurPoolSize -= Size;
   }
 
   return AvailableSlabs.begin();
@@ -566,10 +581,10 @@ void Bucket::onFreeChunk(Slab &Slab) {
 bool Bucket::CanPool() {
   std::lock_guard<sycl::detail::SpinLock> Lock{settings::PoolLock};
   size_t NewFreeSlabsInBucket = AvailableSlabs.size() + 1;
-  if (settings::Capacity >= NewFreeSlabsInBucket) {
-    size_t NewPoolSize = settings::CurPoolSize + Size;
-    if (settings::MaxPoolSize >= NewPoolSize) {
-      settings::CurPoolSize = NewPoolSize;
+  if (settings::USMPoolSettings.Capacity >= NewFreeSlabsInBucket) {
+    size_t NewPoolSize = settings::USMPoolSettings.CurPoolSize + Size;
+    if (settings::USMPoolSettings.MaxPoolSize >= NewPoolSize) {
+      settings::USMPoolSettings.CurPoolSize = NewPoolSize;
       return true;
     }
   }
@@ -582,7 +597,7 @@ void *USMAllocContext::USMAllocImpl::allocate(size_t Size) {
   if (Size == 0)
     return nullptr;
 
-  if (Size > settings::MaxPoolableSize) {
+  if (Size > settings::USMPoolSettings.MaxPoolableSize) {
     return getMemHandle().allocate(Size);
   }
 
@@ -605,7 +620,7 @@ void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment) {
 
   // Check if requested allocation size is within pooling limit.
   // If not, just request aligned pointer from the system.
-  if (AlignedSize > settings::MaxPoolableSize) {
+  if (AlignedSize > settings::USMPoolSettings.MaxPoolableSize) {
     return getMemHandle().allocate(Size, Alignment);
   }