Skip to content

Commit

Permalink
db: keep up to one memtable for recycling
Browse files Browse the repository at this point in the history
We've observed large allocations like the 64MB memtable allocation take 10ms+.
This can add latency to the WAL/memtable rotation critical section during which
the entire commit pipeline is stalled, contributing to batch commit tail
latencies. This commit adapts the memtable lifecycle to keep the most recent
obsolete memtable around for use as the next mutable memtable.

This reduces the commit latency hiccup during a memtable rotation, and it also
reduces block cache mutex contention (cockroachdb#1997) by reducing the number of times we
must reserve memory from the block cache.

```
goos: linux
goarch: amd64
pkg: github.com/cockroachdb/pebble
cpu: Intel(R) Xeon(R) CPU @ 2.30GHz
                   │   old.txt   │               new.txt               │
                   │   sec/op    │   sec/op     vs base                │
RotateMemtables-24   120.7µ ± 2%   102.8µ ± 4%  -14.85% (p=0.000 n=25)

                   │   old.txt    │               new.txt               │
                   │     B/op     │     B/op      vs base               │
RotateMemtables-24   124.3Ki ± 0%   124.0Ki ± 0%  -0.27% (p=0.000 n=25)

                   │  old.txt   │              new.txt              │
                   │ allocs/op  │ allocs/op   vs base               │
RotateMemtables-24   114.0 ± 0%   111.0 ± 0%  -2.63% (p=0.000 n=25)
```

Informs cockroachdb#2646.
  • Loading branch information
jbowens committed Jul 28, 2023
1 parent 7ef7553 commit 87f0a80
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 42 deletions.
77 changes: 62 additions & 15 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,14 @@ func (d defaultCPUWorkGranter) CPUWorkDone(_ CPUWorkHandle) {}
type DB struct {
// The count and size of referenced memtables. This includes memtables
// present in DB.mu.mem.queue, as well as memtables that have been flushed
// but are still referenced by an inuse readState.
// but are still referenced by an inuse readState, as well as up to one
// memTable waiting to be reused and stored in d.memTableRecycle.
memTableCount atomic.Int64
memTableReserved atomic.Int64 // number of bytes reserved in the cache for memtables
// memTableRecycle holds a pointer to an obsolete memtable. The next
// memtable allocation will reuse this memtable if it has not already been
// recycled.
memTableRecycle atomic.Pointer[memTable]

// The size of the current log file (i.e. db.mu.log.queue[len(queue)-1].
logSize atomic.Uint64
Expand Down Expand Up @@ -1540,6 +1545,10 @@ func (d *DB) Close() error {
// replay.
mem.readerUnrefLocked(false)
}
// If there's an unused, recycled memtable, we need to release its memory.
if obsoleteMemTable := d.memTableRecycle.Swap(nil); obsoleteMemTable != nil {
d.freeMemTable(obsoleteMemTable)
}
if reserved := d.memTableReserved.Load(); reserved != 0 {
err = firstError(err, errors.Errorf("leaked memtable reservation: %d", errors.Safe(reserved)))
}
Expand Down Expand Up @@ -2175,30 +2184,68 @@ func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushabl
}
}

d.memTableCount.Add(1)
d.memTableReserved.Add(int64(size))
releaseAccountingReservation := d.opts.Cache.Reserve(size)

mem := newMemTable(memTableOptions{
memtblOpts := memTableOptions{
Options: d.opts,
arenaBuf: manual.New(int(size)),
logSeqNum: logSeqNum,
})
}

// Note: this is a no-op if invariants are disabled or race is enabled.
invariants.SetFinalizer(mem, checkMemTable)
// Before attempting to allocate a new memtable, check if there's one
// available for recycling in memTableRecycle. Large contiguous allocations
// can be costly as fragmentation makes it more difficult to find a large
// contiguous free space. We've observed 64MB allocations taking 10ms+.
//
// To reduce these costly allocations, up to 1 obsolete memtable is stashed
// in `d.memTableRecycle` to allow a future memtable rotation to reuse
// existing memory.
var mem *memTable
mem = d.memTableRecycle.Swap(nil)
if mem != nil && len(mem.arenaBuf) != size {
d.freeMemTable(mem)
mem = nil
}
if mem != nil {
// Carry through the existing buffer and memory reservation.
memtblOpts.arenaBuf = mem.arenaBuf
memtblOpts.releaseAccountingReservation = mem.releaseAccountingReservation
} else {
mem = new(memTable)
memtblOpts.arenaBuf = manual.New(int(size))
memtblOpts.releaseAccountingReservation = d.opts.Cache.Reserve(size)
d.memTableCount.Add(1)
d.memTableReserved.Add(int64(size))

// Note: this is a no-op if invariants are disabled or race is enabled.
invariants.SetFinalizer(mem, checkMemTable)
}
mem.init(memtblOpts)

entry := d.newFlushableEntry(mem, logNum, logSeqNum)
entry.releaseMemAccounting = func() {
manual.Free(mem.arenaBuf)
mem.arenaBuf = nil
d.memTableCount.Add(-1)
d.memTableReserved.Add(-int64(size))
releaseAccountingReservation()
// If the user leaks iterators, we may be releasing the memtable after
// the DB is already closed. In this case, we want to just release the
// memory because DB.Close won't come along to free it for us.
if err := d.closed.Load(); err != nil {
d.freeMemTable(mem)
return
}

// The next memtable allocation might be able to reuse this memtable.
// Stash it on d.memTableRecycle.
if unusedMem := d.memTableRecycle.Swap(mem); unusedMem != nil {
// There was already a memtable waiting to be recycled. We're now
// responsible for freeing it.
d.freeMemTable(unusedMem)
}
}
return mem, entry
}

// freeMemTable decrements the DB's memtable count and reserved-byte
// accounting for m, then releases m's memory (its cache accounting
// reservation and its manually allocated arena buffer).
func (d *DB) freeMemTable(m *memTable) {
	// Capture the arena size before m.free() nils out m.arenaBuf.
	size := int64(len(m.arenaBuf))
	d.memTableCount.Add(-1)
	d.memTableReserved.Add(-size)
	m.free()
}

func (d *DB) newFlushableEntry(f flushable, logNum FileNum, logSeqNum uint64) *flushableEntry {
fe := &flushableEntry{
flushable: f,
Expand Down
45 changes: 39 additions & 6 deletions db_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -827,20 +827,34 @@ func TestMemTableReservation(t *testing.T) {
t.Fatalf("expected failure, but found success: %s", h.Get())
}

// Flush the memtable. The memtable reservation should be unchanged because a
// new memtable will be allocated.
// Flush the memtable. The memtable reservation should double because old
// memtable will be recycled, saved for the next memtable allocation.
require.NoError(t, d.Flush())
checkReserved(int64(opts.MemTableSize))
checkReserved(int64(2 * opts.MemTableSize))
// Flush again. The memtable reservation should be unchanged because at most
// 1 memtable may be preserved for recycling.

// Flush in the presence of an active iterator. The iterator will hold a
// reference to a readState which will in turn hold a reader reference to the
// memtable. While the iterator is open, there will be the memory for 2
// memtables reserved.
// memtable.
iter := d.NewIter(nil)
require.NoError(t, d.Flush())
// The flush moved the recycled memtable into position as an active mutable
// memtable. There are now two allocated memtables: 1 mutable and 1 pinned
// by the iterator's read state.
checkReserved(2 * int64(opts.MemTableSize))

// Flushing again should increase the reservation total to 3x: 1 active
// mutable, 1 for recycling, 1 pinned by iterator's read state.
require.NoError(t, d.Flush())
checkReserved(3 * int64(opts.MemTableSize))

// Closing the iterator will release the iterator's read state, and the old
// memtable will be moved into position as the next memtable to recycle.
// There was already a memtable ready to be recycled, so that memtable will
// be freed and the overall reservation total is reduced to 2x.
require.NoError(t, iter.Close())
checkReserved(int64(opts.MemTableSize))
checkReserved(2 * int64(opts.MemTableSize))

require.NoError(t, d.Close())
}
Expand Down Expand Up @@ -1935,3 +1949,22 @@ func verifyGetNotFound(t *testing.T, r Reader, key []byte) {
t.Fatalf("expected nil, but got %s", val)
}
}

// BenchmarkRotateMemtables measures the latency of a memtable rotation
// (triggered here via Flush) once memtables have grown to their full
// configured size.
func BenchmarkRotateMemtables(b *testing.B) {
	o := &Options{FS: vfs.NewMem(), MemTableSize: 64 << 20 /* 64 MB */}
	d, err := Open("", o)
	require.NoError(b, err)
	// Close the DB when the benchmark finishes so memtable memory and
	// cache reservations are released; the original leaked the open DB.
	defer func() { require.NoError(b, d.Close()) }()

	// We want to jump to full-sized memtables rather than letting the
	// memtable size ramp up from its initial small size.
	d.mu.Lock()
	d.mu.mem.nextSize = o.MemTableSize
	d.mu.Unlock()
	require.NoError(b, d.Flush())

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if err := d.Flush(); err != nil {
			b.Fatal(err)
		}
	}
}
40 changes: 28 additions & 12 deletions mem_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/cockroachdb/pebble/internal/arenaskl"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manual"
"github.com/cockroachdb/pebble/internal/rangedel"
"github.com/cockroachdb/pebble/internal/rangekey"
)
Expand Down Expand Up @@ -84,17 +85,27 @@ type memTable struct {
rangeKeys keySpanCache
// The current logSeqNum at the time the memtable was created. This is
// guaranteed to be less than or equal to any seqnum stored in the memtable.
logSeqNum uint64
logSeqNum uint64
releaseAccountingReservation func()
}

// free releases the memtable's cache accounting reservation and its
// manually allocated arena buffer. Calling free on a nil memTable is a
// no-op.
func (m *memTable) free() {
	if m == nil {
		return
	}
	// Return the cache accounting reservation first, then release the
	// arena and clear the pointer so the buffer cannot be reused.
	m.releaseAccountingReservation()
	manual.Free(m.arenaBuf)
	m.arenaBuf = nil
}

// memTableOptions holds configuration used when creating a memTable. All of
// the fields are optional and will be filled with defaults if not specified,
// which is used by tests.
type memTableOptions struct {
	*Options
	// arenaBuf, if non-nil, is the preallocated backing buffer for the
	// memtable's arena (e.g. carried over from a recycled memtable).
	arenaBuf []byte
	// size is the size of the memtable arena in bytes; if zero,
	// Options.MemTableSize is used instead.
	size int
	// logSeqNum is the current log sequence number at the time the
	// memtable is created.
	logSeqNum uint64
	// releaseAccountingReservation releases the cache accounting
	// reservation made for this memtable's memory; it is invoked when
	// the memtable is freed.
	releaseAccountingReservation func()
}

func checkMemTable(obj interface{}) {
Expand All @@ -109,16 +120,22 @@ func checkMemTable(obj interface{}) {
// Options.MemTableSize is used instead.
func newMemTable(opts memTableOptions) *memTable {
	// Fill in defaults for any unspecified options before initializing
	// the new memtable.
	opts.Options = opts.Options.EnsureDefaults()
	mt := &memTable{}
	mt.init(opts)
	return mt
}

func (m *memTable) init(opts memTableOptions) {
if opts.size == 0 {
opts.size = opts.MemTableSize
}

m := &memTable{
cmp: opts.Comparer.Compare,
formatKey: opts.Comparer.FormatKey,
equal: opts.Comparer.Equal,
arenaBuf: opts.arenaBuf,
logSeqNum: opts.logSeqNum,
*m = memTable{
cmp: opts.Comparer.Compare,
formatKey: opts.Comparer.FormatKey,
equal: opts.Comparer.Equal,
arenaBuf: opts.arenaBuf,
logSeqNum: opts.logSeqNum,
releaseAccountingReservation: opts.releaseAccountingReservation,
}
m.writerRefs.Store(1)
m.tombstones = keySpanCache{
Expand All @@ -143,7 +160,6 @@ func newMemTable(opts memTableOptions) *memTable {
m.rangeDelSkl.Reset(arena, m.cmp)
m.rangeKeySkl.Reset(arena, m.cmp)
m.reserved = arena.Size()
return m
}

func (m *memTable) writerRef() {
Expand Down
5 changes: 4 additions & 1 deletion metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,10 @@ type Metrics struct {
// The count of memtables.
Count int64
// The number of bytes present in zombie memtables which are no longer
// referenced by the current DB state but are still in use by an iterator.
// referenced by the current DB state. An unbounded number of memtables
// may be zombie if they're still in use by an iterator. One additional
// memtable may be zombie if it's no longer in use and waiting to be
// recycled.
ZombieSize uint64
// The count of zombie memtables.
ZombieCount int64
Expand Down
4 changes: 2 additions & 2 deletions testdata/event_listener
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ WAL: 1 files (27B) in: 48B written: 108B (125% overhead)
Flushes: 3
Compactions: 1 estimated debt: 2.0KB in progress: 0 (0B)
default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (256KB) zombie: 0 (0B)
MemTables: 1 (256KB) zombie: 1 (256KB)
Zombie tables: 0 (0B)
Block cache: 6 entries (1.1KB) hit rate: 11.1%
Table cache: 1 entries (800B) hit rate: 40.0%
Expand Down Expand Up @@ -380,7 +380,7 @@ WAL: 1 files (29B) in: 82B written: 110B (34% overhead)
Flushes: 6
Compactions: 1 estimated debt: 4.0KB in progress: 0 (0B)
default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (512KB) zombie: 0 (0B)
MemTables: 1 (512KB) zombie: 1 (512KB)
Zombie tables: 0 (0B)
Block cache: 12 entries (2.3KB) hit rate: 14.3%
Table cache: 1 entries (800B) hit rate: 50.0%
Expand Down
12 changes: 6 additions & 6 deletions testdata/metrics
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
Flushes: 2
Compactions: 1 estimated debt: 0B in progress: 0 (0B)
default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (256KB) zombie: 1 (256KB)
MemTables: 1 (256KB) zombie: 2 (512KB)
Zombie tables: 2 (1.2KB)
Block cache: 5 entries (1.0KB) hit rate: 42.9%
Table cache: 2 entries (1.6KB) hit rate: 66.7%
Expand Down Expand Up @@ -180,7 +180,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
Flushes: 2
Compactions: 1 estimated debt: 0B in progress: 0 (0B)
default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (256KB) zombie: 1 (256KB)
MemTables: 1 (256KB) zombie: 2 (512KB)
Zombie tables: 1 (633B)
Block cache: 3 entries (528B) hit rate: 42.9%
Table cache: 1 entries (800B) hit rate: 66.7%
Expand Down Expand Up @@ -216,7 +216,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
Flushes: 2
Compactions: 1 estimated debt: 0B in progress: 0 (0B)
default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (256KB) zombie: 0 (0B)
MemTables: 1 (256KB) zombie: 1 (256KB)
Zombie tables: 0 (0B)
Block cache: 0 entries (0B) hit rate: 42.9%
Table cache: 0 entries (0B) hit rate: 66.7%
Expand Down Expand Up @@ -278,7 +278,7 @@ WAL: 1 files (93B) in: 116B written: 242B (109% overhead)
Flushes: 3
Compactions: 1 estimated debt: 2.8KB in progress: 0 (0B)
default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (256KB) zombie: 0 (0B)
MemTables: 1 (256KB) zombie: 1 (256KB)
Zombie tables: 0 (0B)
Block cache: 0 entries (0B) hit rate: 42.9%
Table cache: 0 entries (0B) hit rate: 66.7%
Expand Down Expand Up @@ -324,7 +324,7 @@ WAL: 1 files (93B) in: 116B written: 242B (109% overhead)
Flushes: 3
Compactions: 2 estimated debt: 0B in progress: 0 (0B)
default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (256KB) zombie: 0 (0B)
MemTables: 1 (256KB) zombie: 1 (256KB)
Zombie tables: 0 (0B)
Block cache: 0 entries (0B) hit rate: 27.3%
Table cache: 0 entries (0B) hit rate: 58.3%
Expand Down Expand Up @@ -416,7 +416,7 @@ WAL: 1 files (26B) in: 176B written: 175B (-1% overhead)
Flushes: 8
Compactions: 2 estimated debt: 4.8KB in progress: 0 (0B)
default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
MemTables: 1 (1.0MB) zombie: 0 (0B)
MemTables: 1 (1.0MB) zombie: 1 (1.0MB)
Zombie tables: 0 (0B)
Block cache: 12 entries (2.3KB) hit rate: 31.1%
Table cache: 3 entries (2.3KB) hit rate: 57.9%
Expand Down

0 comments on commit 87f0a80

Please sign in to comment.