intel · byrnedj · Nov 30, 2022 · Nov 14, 2022 · Nov 30, 2022
diff --git a/cachelib/allocator/CCacheAllocator.cpp b/cachelib/allocator/CCacheAllocator.cpp
@@ -36,7 +36,8 @@ CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator,
       currentChunksIndex_(0) {
   auto& currentChunks = chunks_[currentChunksIndex_];
   for (auto chunk : *object.chunks()) {
-    currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk)));
+    // TODO : pass multi-tier flag when compact cache supports multi-tier config
+    currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk), false));
   }
 }
 
@@ -97,7 +98,8 @@ CCacheAllocator::SerializationType CCacheAllocator::saveState() {
 
   std::lock_guard<std::mutex> guard(resizeLock_);
   for (auto chunk : getCurrentChunks()) {
-    object.chunks()->push_back(allocator_.compress(chunk).saveState());
+    // TODO : pass multi-tier flag when compact cache supports multi-tier config
+    object.chunks()->push_back(allocator_.compress(chunk, false).saveState());
   }
   return object;
 }

diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h
@@ -1362,8 +1362,8 @@ class CacheAllocator : public CacheBase {
                  sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) +
                  sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item),
                 "vtable overhead");
-  // XXX: this will fail due to CompressedPtr change
-  // static_assert(32 == sizeof(Item), "item overhead is 32 bytes");
+  // Check for CompressedPtr single/multi tier support
+  static_assert(32 == sizeof(Item), "item overhead is 32 bytes");
 
   // make sure there is no overhead in ChainedItem on top of a regular Item
   static_assert(sizeof(Item) == sizeof(ChainedItem),

diff --git a/cachelib/allocator/memory/CompressedPtr.h b/cachelib/allocator/memory/CompressedPtr.h
@@ -30,21 +30,22 @@ class SlabAllocator;
 template <typename PtrType, typename AllocatorContainer>
 class PtrCompressor;
 
-// the following are for pointer compression for the memory allocator.  We
-// compress pointers by storing the slab index and the alloc index of the
-// allocation inside the slab. With slab worth kNumSlabBits of data, if we
-// have the min allocation size as 64 bytes, that requires kNumSlabBits - 6
-// bits for storing the alloc index. This leaves the remaining (32 -
-// (kNumSlabBits - 6)) bits for the slab index.  Hence we can index 256 GiB
-// of memory in slabs and index anything more than 64 byte allocations inside
-// the slab using a 32 bit representation.
-//
 // This CompressedPtr makes decompression fast by staying away from division and
-// modulo arithmetic and doing those during the compression time. We most often
-// decompress a CompressedPtr than compress a pointer while creating one.
+// modulo arithmetic and doing those during the compression time. We most  often
+// decompress a CompressedPtr than compress a pointer  while creating one.  This
+// is used for pointer compression by the memory allocator.
+
+// We compress pointers by storing the tier index, slab index and alloc index of
+// the allocation inside the slab. With slab worth kNumSlabBits (22 bits) of data,
+// if we have the min allocation size as 64 bytes, that requires kNumSlabBits - 6
+// = 16 bits for storing the alloc index.  The tier id occupies the 32nd bit only
+// since its value cannot exceed kMaxTiers (2). This leaves the remaining
+// (32 - (kNumSlabBits - 6) - 1 bit for tier id) =  15 bits  for the  slab  index.
+// Hence we can index 128 GiB of memory in slabs per tier and index anything more
+// than 64 byte allocations inside the slab using a 32 bit representation.
 class CACHELIB_PACKED_ATTR CompressedPtr {
  public:
-  using PtrType = uint64_t;
+  using PtrType = uint32_t;
   // Thrift doesn't support unsigned type
   using SerializedPtrType = int64_t;
 
@@ -65,9 +66,9 @@ class CACHELIB_PACKED_ATTR CompressedPtr {
     return static_cast<uint32_t>(1) << (Slab::kMinAllocPower);
   }
 
-  // maximum adressable memory for pointer compression to work.
+  // maximum addressable memory for pointer compression to work.
   static constexpr size_t getMaxAddressableSize() noexcept {
-    return static_cast<size_t>(1) << (kNumSlabIdxBits + Slab::kNumSlabBits);
+    return static_cast<size_t>(1) << (kNumSlabIdxBits + Slab::kNumSlabBits + 1);
   }
 
   // default construct to nullptr.
@@ -92,8 +93,8 @@ class CACHELIB_PACKED_ATTR CompressedPtr {
   PtrType ptr_{kNull};
 
   // create a compressed pointer for a valid memory allocation.
-  CompressedPtr(uint32_t slabIdx, uint32_t allocIdx, TierId tid = 0)
-      : ptr_(compress(slabIdx, allocIdx, tid)) {}
+  CompressedPtr(uint32_t slabIdx, uint32_t allocIdx, bool isMultiTiered, TierId tid = 0)
+      : ptr_(compress(slabIdx, allocIdx, isMultiTiered, tid)) {}
 
   constexpr explicit CompressedPtr(PtrType ptr) noexcept : ptr_{ptr} {}
 
@@ -103,45 +104,48 @@ class CACHELIB_PACKED_ATTR CompressedPtr {
   static constexpr unsigned int kNumAllocIdxBits =
       Slab::kNumSlabBits - Slab::kMinAllocPower;
 
-  // Use topmost 32 bits for TierId
-  // XXX: optimize
-  static constexpr unsigned int kNumTierIdxOffset = 32;
+  // Use 32nd bit position for TierId
+  static constexpr unsigned int kNumTierIdxOffset = 31;
 
   static constexpr PtrType kAllocIdxMask = ((PtrType)1 << kNumAllocIdxBits) - 1;
 
   // kNumTierIdxBits most significant bits
-  static constexpr PtrType kTierIdxMask = (((PtrType)1 << kNumTierIdxOffset) - 1) << (NumBits<PtrType>::value - kNumTierIdxOffset);
+  static constexpr PtrType kTierIdxMask = (PtrType)1 << kNumTierIdxOffset;
 
   // Number of bits for the slab index. This will be the top 16 bits of the
   // compressed ptr.
   static constexpr unsigned int kNumSlabIdxBits =
-      NumBits<PtrType>::value - kNumTierIdxOffset - kNumAllocIdxBits; 
+      kNumTierIdxOffset - kNumAllocIdxBits; 
 
   // Compress the given slabIdx and allocIdx into a 64-bit compressed
   // pointer.
-  static PtrType compress(uint32_t slabIdx, uint32_t allocIdx, TierId tid) noexcept {
+  static PtrType compress(uint32_t slabIdx, uint32_t allocIdx, bool isMultiTiered, TierId tid) noexcept {
     XDCHECK_LE(allocIdx, kAllocIdxMask);
+    if (!isMultiTiered) {
+      XDCHECK_LT(slabIdx, (1u << (kNumSlabIdxBits+1)) - 1);
+      return (slabIdx << kNumAllocIdxBits) + allocIdx;
+    }
     XDCHECK_LT(slabIdx, (1u << kNumSlabIdxBits) - 1);
     return (static_cast<uint64_t>(tid) << kNumTierIdxOffset) + (slabIdx << kNumAllocIdxBits) + allocIdx;
   }
 
   // Get the slab index of the compressed ptr
-  uint32_t getSlabIdx() const noexcept {
+  uint32_t getSlabIdx(bool isMultiTiered) const noexcept {
     XDCHECK(!isNull());
-    auto noTierIdPtr = ptr_ & ~kTierIdxMask;
+    auto noTierIdPtr = isMultiTiered ? ptr_ & ~kTierIdxMask : ptr_;
     return static_cast<uint32_t>(noTierIdPtr >> kNumAllocIdxBits);
   }
 
   // Get the allocation index of the compressed ptr
-  uint32_t getAllocIdx() const noexcept {
+  uint32_t getAllocIdx(bool isMultiTiered) const noexcept {
     XDCHECK(!isNull());
-    auto noTierIdPtr = ptr_ & ~kTierIdxMask;
+    auto noTierIdPtr = isMultiTiered ? ptr_ & ~kTierIdxMask : ptr_;
     return static_cast<uint32_t>(noTierIdPtr & kAllocIdxMask);
   }
 
-  uint32_t getTierId() const noexcept {
+  uint32_t getTierId(bool isMultiTiered) const noexcept {
     XDCHECK(!isNull());
-    return static_cast<uint32_t>(ptr_ >> kNumTierIdxOffset);
+    return isMultiTiered ? static_cast<uint32_t>(ptr_ >> kNumTierIdxOffset) : 0;
   }
 
   void setTierId(TierId tid) noexcept {
@@ -160,11 +164,11 @@ class SingleTierPtrCompressor {
       : allocator_(allocator) {}
 
   const CompressedPtr compress(const PtrType* uncompressed) const {
-    return allocator_.compress(uncompressed);
+    return allocator_.compress(uncompressed, false);
   }
 
   PtrType* unCompress(const CompressedPtr compressed) const {
-    return static_cast<PtrType*>(allocator_.unCompress(compressed));
+    return static_cast<PtrType*>(allocator_.unCompress(compressed, false));
   }
 
   bool operator==(const SingleTierPtrCompressor& rhs) const noexcept {
@@ -196,19 +200,21 @@ class PtrCompressor {
         break;
     }
 
-    auto cptr = allocators_[tid]->compress(uncompressed);
-    cptr.setTierId(tid);
-
+    bool isMultiTiered = allocators_.size() > 1;
+    auto cptr = allocators_[tid]->compress(uncompressed, isMultiTiered);
+    if (isMultiTiered) { // config has multiple tiers
+      cptr.setTierId(tid);
+    }
     return cptr;
   }
 
   PtrType* unCompress(const CompressedPtr compressed) const {
     if (compressed.isNull()) {
       return nullptr;
     }
-
-    auto &allocator = *allocators_[compressed.getTierId()];
-    return static_cast<PtrType*>(allocator.unCompress(compressed));
+    bool isMultiTiered = allocators_.size() > 1;
+    auto &allocator = *allocators_[compressed.getTierId(isMultiTiered)];
+    return static_cast<PtrType*>(allocator.unCompress(compressed, isMultiTiered));
   }
 
   bool operator==(const PtrCompressor& rhs) const noexcept {

diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h
@@ -543,8 +543,8 @@ class MemoryAllocator {
   //                as the original pointer is valid.
   //
   // @throw  std::invalid_argument if the ptr is invalid.
-  CompressedPtr CACHELIB_INLINE compress(const void* ptr) const {
-    return slabAllocator_.compress(ptr);
+  CompressedPtr CACHELIB_INLINE compress(const void* ptr, bool isMultiTiered) const {
+    return slabAllocator_.compress(ptr, isMultiTiered);
   }
 
   // retrieve the raw pointer corresponding to the compressed pointer. This is
@@ -555,8 +555,8 @@ class MemoryAllocator {
   // @return        the raw pointer corresponding to this compressed pointer.
   //
   // @throw   std::invalid_argument if the compressed pointer is invalid.
-  void* CACHELIB_INLINE unCompress(const CompressedPtr cPtr) const {
-    return slabAllocator_.unCompress(cPtr);
+  void* CACHELIB_INLINE unCompress(const CompressedPtr cPtr, bool isMultiTiered) const {
+    return slabAllocator_.unCompress(cPtr, isMultiTiered);
   }
 
   // a special implementation of pointer compression for benchmarking purposes.

diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h
@@ -225,7 +225,7 @@ class SlabAllocator {
   // the corresponding memory allocator. trying to inline this just increases
   // the code size and does not move the needle on the benchmarks much.
   // Calling this with invalid input in optimized build is undefined behavior.
-  CompressedPtr CACHELIB_INLINE compress(const void* ptr) const {
+  CompressedPtr CACHELIB_INLINE compress(const void* ptr, bool isMultiTiered) const {
     if (ptr == nullptr) {
       return CompressedPtr{};
     }
@@ -246,19 +246,19 @@ class SlabAllocator {
         static_cast<uint32_t>(reinterpret_cast<const uint8_t*>(ptr) -
                               reinterpret_cast<const uint8_t*>(slab)) /
         allocSize;
-    return CompressedPtr{slabIndex, allocIdx};
+    return CompressedPtr{slabIndex, allocIdx, isMultiTiered};
   }
 
   // uncompress the point and return the raw ptr.  This function never throws
   // in optimized build and assumes that the caller is responsible for calling
   // it with a valid compressed pointer.
-  void* CACHELIB_INLINE unCompress(const CompressedPtr ptr) const {
+  void* CACHELIB_INLINE unCompress(const CompressedPtr ptr, bool isMultiTiered) const {
     if (ptr.isNull()) {
       return nullptr;
     }
 
-    const SlabIdx slabIndex = ptr.getSlabIdx();
-    const uint32_t allocIdx = ptr.getAllocIdx();
+    const SlabIdx slabIndex = ptr.getSlabIdx(isMultiTiered);
+    const uint32_t allocIdx = ptr.getAllocIdx(isMultiTiered);
     const Slab* slab = &slabMemoryStart_[slabIndex];
 
 #ifndef NDEBUG

diff --git a/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp b/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp
@@ -401,13 +401,13 @@ TEST_F(MemoryAllocatorTest, PointerCompression) {
   for (const auto& pool : poolAllocs) {
     const auto& allocs = pool.second;
     for (const auto* alloc : allocs) {
-      CompressedPtr ptr = m.compress(alloc);
+      CompressedPtr ptr = m.compress(alloc, false);
       ASSERT_FALSE(ptr.isNull());
-      ASSERT_EQ(alloc, m.unCompress(ptr));
+      ASSERT_EQ(alloc, m.unCompress(ptr, false));
     }
   }
 
-  ASSERT_EQ(nullptr, m.unCompress(m.compress(nullptr)));
+  ASSERT_EQ(nullptr, m.unCompress(m.compress(nullptr, false), false));
 }
 
 TEST_F(MemoryAllocatorTest, Restorable) {

diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
@@ -136,7 +136,6 @@ class AllocatorMemoryTiersTest : public AllocatorTest<AllocatorT> {
         stats = allocator->getGlobalCacheStats();
         slabStats = allocator->getAllocationClassStats(0,0,cid);
     }
-    ASSERT_GE(slabStats.approxFreePercent,9.5);
 
     auto perclassEstats = allocator->getBackgroundMoverClassStats(MoverDir::Evict);
     auto perclassPstats = allocator->getBackgroundMoverClassStats(MoverDir::Promote);

diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h
@@ -4928,13 +4928,13 @@ class BaseAllocatorTest : public AllocatorTest<AllocatorT> {
     /* TODO: we adjust alloc size by -20 or -40 due to increased CompressedPtr size */
     auto allocateItem1 =
         std::async(std::launch::async, allocFn, std::string{"hello"},
-                   std::vector<uint32_t>{100 - 20, 500, 1000});
+                   std::vector<uint32_t>{100, 500, 1000});
     auto allocateItem2 =
         std::async(std::launch::async, allocFn, std::string{"world"},
-                   std::vector<uint32_t>{200- 40, 1000, 2000});
+                   std::vector<uint32_t>{200, 1000, 2000});
     auto allocateItem3 =
         std::async(std::launch::async, allocFn, std::string{"yolo"},
-                   std::vector<uint32_t>{100-20, 200, 5000});
+                   std::vector<uint32_t>{100, 200, 5000});
 
     auto slabRelease = std::async(releaseFn);
     slabRelease.wait();
@@ -5772,9 +5772,7 @@ class BaseAllocatorTest : public AllocatorTest<AllocatorT> {
     AllocatorT alloc(config);
     const size_t numBytes = alloc.getCacheMemoryStats().cacheSize;
     const auto poolSize = numBytes / 2;
-    // TODO: becasue CompressedPtr size is increased, key1 must be of equal
-    // size with key2
-    std::string key1 = "key1";
+    std::string key1 = "key1-some-random-string-here";
     auto poolId = alloc.addPool("one", poolSize, {} /* allocSizes */, mmConfig);
     auto handle1 = alloc.allocate(poolId, key1, 1);
     alloc.insert(handle1);
@@ -5831,37 +5829,35 @@ class BaseAllocatorTest : public AllocatorTest<AllocatorT> {
     auto poolId = alloc.addPool("one", poolSize, {} /* allocSizes */, mmConfig);
     auto handle1 = alloc.allocate(poolId, key1, 1);
     alloc.insert(handle1);
-    // TODO: key2 must be the same length as the rest due to increased
-    // CompressedPtr size
-    auto handle2 = alloc.allocate(poolId, "key2-some-random-string-here", 1);
+    auto handle2 = alloc.allocate(poolId, "key2", 1);
     alloc.insert(handle2);
-    ASSERT_NE(alloc.find("key2-some-random-string-here"), nullptr);
+    ASSERT_NE(alloc.find("key2"), nullptr);
     sleep(9);
 
     ASSERT_NE(alloc.find(key1), nullptr);
     auto tail = alloc.dumpEvictionIterator(
-        poolId, 1 /* second allocation class, TODO: CompressedPtr */, 3 /* last 3 items */);
+        poolId, 0 /* first allocation class */, 3 /* last 3 items */);
     // item 1 gets promoted (age 9), tail age 9, lru refresh time 3 (default)
     EXPECT_TRUE(checkItemKey(tail[1], key1));
 
     auto handle3 = alloc.allocate(poolId, key3, 1);
     alloc.insert(handle3);
 
     sleep(6);
-    tail = alloc.dumpEvictionIterator(poolId, 1 /* second allocation class, TODO: CompressedPtr */,
+    tail = alloc.dumpEvictionIterator(poolId, 0 /* first allocation class */,
                                       3 /* last 3 items */);
     ASSERT_NE(alloc.find(key3), nullptr);
-    tail = alloc.dumpEvictionIterator(poolId, 1 /* second allocation class, TODO: CompressedPtr */,
+    tail = alloc.dumpEvictionIterator(poolId, 0 /* first allocation class */,
                                       3 /* last 3 items */);
     // tail age 15, lru refresh time 6 * 0.7 = 4.2 = 4,
     // item 3 age 6 gets promoted
     EXPECT_TRUE(checkItemKey(tail[1], key1));
 
-    alloc.remove("key2-some-random-string-here");
+    alloc.remove("key2");
     sleep(3);
 
     ASSERT_NE(alloc.find(key3), nullptr);
-    tail = alloc.dumpEvictionIterator(poolId, 1 /* second allocation class, TODO: CompressedPtr */,
+    tail = alloc.dumpEvictionIterator(poolId, 0 /* second allocation class */,
                                       2 /* last 2 items */);
     // tail age 9, lru refresh time 4, item 3 age 3, not promoted
     EXPECT_TRUE(checkItemKey(tail[1], key3));

diff --git a/cachelib/benchmarks/PtrCompressionBench.cpp b/cachelib/benchmarks/PtrCompressionBench.cpp
@@ -61,7 +61,7 @@ void buildAllocs(size_t poolSize) {
         void* alloc = ma->allocate(pid, size);
         XDCHECK_GE(size, CompressedPtr::getMinAllocSize());
         if (alloc != nullptr) {
-          validAllocs.push_back({alloc, ma->compress(alloc)});
+          validAllocs.push_back({alloc, ma->compress(alloc, false)});
           validAllocsAlt.push_back({alloc, ma->compressAlt(alloc)});
           numAllocations++;
         }
@@ -83,7 +83,7 @@ BENCHMARK(CompressionAlt) {
 
 BENCHMARK_RELATIVE(Compression) {
   for (const auto& alloc : validAllocs) {
-    CompressedPtr c = m->compress(alloc.first);
+    CompressedPtr c = m->compress(alloc.first, false);
     folly::doNotOptimizeAway(c);
   }
 }
@@ -97,7 +97,7 @@ BENCHMARK(DeCompressAlt) {
 
 BENCHMARK_RELATIVE(DeCompress) {
   for (const auto& alloc : validAllocs) {
-    void* ptr = m->unCompress(alloc.second);
+    void* ptr = m->unCompress(alloc.second, false);
     folly::doNotOptimizeAway(ptr);
   }
 }

diff --git a/run_tests.sh b/run_tests.sh
@@ -2,6 +2,7 @@
 
 # Newline separated list of tests to ignore
 BLACKLIST="allocator-test-NavySetupTest
+allocator-test-NvmCacheTests
 shm-test-test_page_size"
 
 if [ "$1" == "long" ]; then